{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re, collections"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"The aims for this subject is for students to develop an understanding of the main algorithms used in naturallanguage processing, for use in a diverse range of applications including text classification, machine translation, and question answering. Topics to be covered include part-of-speech tagging, n-gram language modelling, syntactic parsing and deep learning. The programming language used is Python, see for more information on its use in the workshops, assignments and installation at home.\"\n",
    "# text = 'low '*5 +'lower '*2+'newest '*6 +'widest '*3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "defaultdict(<class 'int'>, {'T h e </w>': 2, 'a i m s </w>': 1, 'f o r </w>': 4, 't h i s </w>': 1, 's u b j e c t </w>': 1, 'i s </w>': 2, 's t u d e n t s </w>': 1, 't o </w>': 2, 'd e v e l o p </w>': 1, 'a n </w>': 1, 'u n d e r s t a n d i n g </w>': 1, 'o f </w>': 2, 't h e </w>': 2, 'm a i n </w>': 1, 'a l g o r i t h m s </w>': 1, 'u s e d </w>': 2, 'i n </w>': 3, 'n a t u r a l l a n g u a g e </w>': 1, 'p r o c e s s i n g , </w>': 1, 'u s e </w>': 2, 'a </w>': 1, 'd i v e r s e </w>': 1, 'r a n g e </w>': 1, 'a p p l i c a t i o n s </w>': 1, 'i n c l u d i n g </w>': 1, 't e x t </w>': 1, 'c l a s s i f i c a t i o n , </w>': 1, 'm a c h i n e </w>': 1, 't r a n s l a t i o n , </w>': 1, 'a n d </w>': 3, 'q u e s t i o n </w>': 1, 'a n s w e r i n g . </w>': 1, 'T o p i c s </w>': 1, 'b e </w>': 1, 'c o v e r e d </w>': 1, 'i n c l u d e </w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g i n g , </w>': 1, 'n - g r a m </w>': 1, 'l a n g u a g e </w>': 2, 'm o d e l l i n g , </w>': 1, 's y n t a c t i c </w>': 1, 'p a r s i n g </w>': 1, 'd e e p </w>': 1, 'l e a r n i n g . </w>': 1, 'p r o g r a m m i n g </w>': 1, 'P y t h o n , </w>': 1, 's e e </w>': 1, 'm o r e </w>': 1, 'i n f o r m a t i o n </w>': 1, 'o n </w>': 1, 'i t s </w>': 1, 'w o r k s h o p s , </w>': 1, 'a s s i g n m e n t s </w>': 1, 'i n s t a l l a t i o n </w>': 1, 'a t </w>': 1, 'h o m e . </w>': 1})\n"
     ]
    }
   ],
   "source": [
    "'''\n",
    "先统计词频\n",
    "'''\n",
    "def get_vocab(text):\n",
    "    \n",
    "    # 初始化为 0\n",
    "    vocab = collections.defaultdict(int)\n",
    "    # 去头去尾再根据空格split\n",
    "    for word in text.strip().split():\n",
    "        #note: we use the special token </w> (instead of underscore in the lecture) to denote the end of a word\n",
    "        # 给list中每个元素增加空格，并在最后增加结束符号，同时统计单词出现次数\n",
    "        vocab[' '.join(list(word)) + ' </w>'] += 1\n",
    "    return vocab\n",
    "print(get_vocab(text))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "统计相邻字符对的频率"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "这个函数遍历词汇表中的所有单词，并计算彼此相邻的一对标记。\n",
    "\n",
    "EXAMPLE:\n",
    "    word = 'T h e <\\w>'\n",
    "    这个单词可以两两组合成： [('T', 'h'), ('h', 'e'), ('e', '<\\w>')]\n",
    "    \n",
    "输入:\n",
    "    vocab: Dict[str, int]  # vocab统计了词语出现的词频\n",
    "    \n",
    "输出:\n",
    "    pairs: Dict[Tuple[str, str], int] # 字母对，pairs统计了单词对出现的频率\n",
    "\"\"\"\n",
    "def get_stats(vocab):\n",
    "    pairs = collections.defaultdict(int)\n",
    "    \n",
    "    for word,freq in vocab.items():\n",
    "        \n",
    "        # 遍历每一个word里面的symbol，去凑所有的相邻两个内容\n",
    "        symbols = word.split()\n",
    "        for i in range(len(symbols)-1):\n",
    "            pairs[(symbols[i],symbols[i+1])] += freq\n",
    "\n",
    "    return pairs\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "开始合并高频字符对"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vocab = defaultdict(<class 'int'>, {'T h e </w>': 2, 'a i m s </w>': 1, 'f o r </w>': 4, 't h i s </w>': 1, 's u b j e c t </w>': 1, 'i s </w>': 2, 's t u d e n t s </w>': 1, 't o </w>': 2, 'd e v e l o p </w>': 1, 'a n </w>': 1, 'u n d e r s t a n d i n g </w>': 1, 'o f </w>': 2, 't h e </w>': 2, 'm a i n </w>': 1, 'a l g o r i t h m s </w>': 1, 'u s e d </w>': 2, 'i n </w>': 3, 'n a t u r a l l a n g u a g e </w>': 1, 'p r o c e s s i n g , </w>': 1, 'u s e </w>': 2, 'a </w>': 1, 'd i v e r s e </w>': 1, 'r a n g e </w>': 1, 'a p p l i c a t i o n s </w>': 1, 'i n c l u d i n g </w>': 1, 't e x t </w>': 1, 'c l a s s i f i c a t i o n , </w>': 1, 'm a c h i n e </w>': 1, 't r a n s l a t i o n , </w>': 1, 'a n d </w>': 3, 'q u e s t i o n </w>': 1, 'a n s w e r i n g . </w>': 1, 'T o p i c s </w>': 1, 'b e </w>': 1, 'c o v e r e d </w>': 1, 'i n c l u d e </w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g i n g , </w>': 1, 'n - g r a m </w>': 1, 'l a n g u a g e </w>': 2, 'm o d e l l i n g , </w>': 1, 's y n t a c t i c </w>': 1, 'p a r s i n g </w>': 1, 'd e e p </w>': 1, 'l e a r n i n g . </w>': 1, 'p r o g r a m m i n g </w>': 1, 'P y t h o n , </w>': 1, 's e e </w>': 1, 'm o r e </w>': 1, 'i n f o r m a t i o n </w>': 1, 'o n </w>': 1, 'i t s </w>': 1, 'w o r k s h o p s , </w>': 1, 'a s s i g n m e n t s </w>': 1, 'i n s t a l l a t i o n </w>': 1, 'a t </w>': 1, 'h o m e . </w>': 1})\n",
      "==========\n",
      "Tokens Before BPE\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 11, 'e': 39, '</w>': 72, 'a': 38, 'i': 37, 'm': 12, 's': 34, 'f': 9, 'o': 29, 'r': 22, 't': 29, 'u': 14, 'b': 2, 'j': 1, 'c': 13, 'd': 15, 'n': 45, 'v': 3, 'l': 16, 'p': 11, 'g': 22, ',': 7, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1})\n",
      "Number of tokens: 31\n",
      "==========\n",
      "Iter: 0\n",
      "Best pair: ('i', 'n')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 11, 'e': 39, '</w>': 72, 'a': 38, 'i': 19, 'm': 12, 's': 34, 'f': 9, 'o': 29, 'r': 22, 't': 29, 'u': 14, 'b': 2, 'j': 1, 'c': 13, 'd': 15, 'n': 27, 'v': 3, 'l': 16, 'p': 11, 'g': 22, ',': 7, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 18})\n",
      "Number of tokens: 32\n",
      "==========\n",
      "vocab,  {'T h e </w>': 2, 'a i m s </w>': 1, 'f o r </w>': 4, 't h i s </w>': 1, 's u b j e c t </w>': 1, 'i s </w>': 2, 's t u d e n t s </w>': 1, 't o </w>': 2, 'd e v e l o p </w>': 1, 'a n </w>': 1, 'u n d e r s t a n d in g </w>': 1, 'o f </w>': 2, 't h e </w>': 2, 'm a in </w>': 1, 'a l g o r i t h m s </w>': 1, 'u s e d </w>': 2, 'in </w>': 3, 'n a t u r a l l a n g u a g e </w>': 1, 'p r o c e s s in g , </w>': 1, 'u s e </w>': 2, 'a </w>': 1, 'd i v e r s e </w>': 1, 'r a n g e </w>': 1, 'a p p l i c a t i o n s </w>': 1, 'in c l u d in g </w>': 1, 't e x t </w>': 1, 'c l a s s i f i c a t i o n , </w>': 1, 'm a c h in e </w>': 1, 't r a n s l a t i o n , </w>': 1, 'a n d </w>': 3, 'q u e s t i o n </w>': 1, 'a n s w e r in g . </w>': 1, 'T o p i c s </w>': 1, 'b e </w>': 1, 'c o v e r e d </w>': 1, 'in c l u d e </w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g in g , </w>': 1, 'n - g r a m </w>': 1, 'l a n g u a g e </w>': 2, 'm o d e l l in g , </w>': 1, 's y n t a c t i c </w>': 1, 'p a r s in g </w>': 1, 'd e e p </w>': 1, 'l e a r n in g . </w>': 1, 'p r o g r a m m in g </w>': 1, 'P y t h o n , </w>': 1, 's e e </w>': 1, 'm o r e </w>': 1, 'in f o r m a t i o n </w>': 1, 'o n </w>': 1, 'i t s </w>': 1, 'w o r k s h o p s , </w>': 1, 'a s s i g n m e n t s </w>': 1, 'in s t a l l a t i o n </w>': 1, 'a t </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 1\n",
      "Best pair: ('e', '</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 11, 'e': 23, '</w>': 56, 'a': 38, 'i': 19, 'm': 12, 's': 34, 'f': 9, 'o': 29, 'r': 22, 't': 29, 'u': 14, 'b': 2, 'j': 1, 'c': 13, 'd': 15, 'n': 27, 'v': 3, 'l': 16, 'p': 11, 'g': 22, ',': 7, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 18, 'e</w>': 16})\n",
      "Number of tokens: 33\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s </w>': 1, 'f o r </w>': 4, 't h i s </w>': 1, 's u b j e c t </w>': 1, 'i s </w>': 2, 's t u d e n t s </w>': 1, 't o </w>': 2, 'd e v e l o p </w>': 1, 'a n </w>': 1, 'u n d e r s t a n d in g </w>': 1, 'o f </w>': 2, 't h e</w>': 2, 'm a in </w>': 1, 'a l g o r i t h m s </w>': 1, 'u s e d </w>': 2, 'in </w>': 3, 'n a t u r a l l a n g u a g e</w>': 1, 'p r o c e s s in g , </w>': 1, 'u s e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r a n g e</w>': 1, 'a p p l i c a t i o n s </w>': 1, 'in c l u d in g </w>': 1, 't e x t </w>': 1, 'c l a s s i f i c a t i o n , </w>': 1, 'm a c h in e</w>': 1, 't r a n s l a t i o n , </w>': 1, 'a n d </w>': 3, 'q u e s t i o n </w>': 1, 'a n s w e r in g . </w>': 1, 'T o p i c s </w>': 1, 'b e</w>': 1, 'c o v e r e d </w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g in g , </w>': 1, 'n - g r a m </w>': 1, 'l a n g u a g e</w>': 2, 'm o d e l l in g , </w>': 1, 's y n t a c t i c </w>': 1, 'p a r s in g </w>': 1, 'd e e p </w>': 1, 'l e a r n in g . </w>': 1, 'p r o g r a m m in g </w>': 1, 'P y t h o n , </w>': 1, 's e e</w>': 1, 'm o r e</w>': 1, 'in f o r m a t i o n </w>': 1, 'o n </w>': 1, 'i t s </w>': 1, 'w o r k s h o p s , </w>': 1, 'a s s i g n m e n t s </w>': 1, 'in s t a l l a t i o n </w>': 1, 'a t </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 2\n",
      "Best pair: ('a', 'n')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 11, 'e': 23, '</w>': 56, 'a': 27, 'i': 19, 'm': 12, 's': 34, 'f': 9, 'o': 29, 'r': 22, 't': 29, 'u': 14, 'b': 2, 'j': 1, 'c': 13, 'd': 15, 'n': 16, 'v': 3, 'l': 16, 'p': 11, 'g': 22, ',': 7, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 18, 'e</w>': 16, 'an': 11})\n",
      "Number of tokens: 34\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s </w>': 1, 'f o r </w>': 4, 't h i s </w>': 1, 's u b j e c t </w>': 1, 'i s </w>': 2, 's t u d e n t s </w>': 1, 't o </w>': 2, 'd e v e l o p </w>': 1, 'an </w>': 1, 'u n d e r s t an d in g </w>': 1, 'o f </w>': 2, 't h e</w>': 2, 'm a in </w>': 1, 'a l g o r i t h m s </w>': 1, 'u s e d </w>': 2, 'in </w>': 3, 'n a t u r a l l an g u a g e</w>': 1, 'p r o c e s s in g , </w>': 1, 'u s e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r an g e</w>': 1, 'a p p l i c a t i o n s </w>': 1, 'in c l u d in g </w>': 1, 't e x t </w>': 1, 'c l a s s i f i c a t i o n , </w>': 1, 'm a c h in e</w>': 1, 't r an s l a t i o n , </w>': 1, 'an d </w>': 3, 'q u e s t i o n </w>': 1, 'an s w e r in g . </w>': 1, 'T o p i c s </w>': 1, 'b e</w>': 1, 'c o v e r e d </w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g in g , </w>': 1, 'n - g r a m </w>': 1, 'l an g u a g e</w>': 2, 'm o d e l l in g , </w>': 1, 's y n t a c t i c </w>': 1, 'p a r s in g </w>': 1, 'd e e p </w>': 1, 'l e a r n in g . </w>': 1, 'p r o g r a m m in g </w>': 1, 'P y t h o n , </w>': 1, 's e e</w>': 1, 'm o r e</w>': 1, 'in f o r m a t i o n </w>': 1, 'o n </w>': 1, 'i t s </w>': 1, 'w o r k s h o p s , </w>': 1, 'a s s i g n m e n t s </w>': 1, 'in s t a l l a t i o n </w>': 1, 'a t </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 3\n",
      "Best pair: ('s', '</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 11, 'e': 23, '</w>': 46, 'a': 27, 'i': 19, 'm': 12, 's': 24, 'f': 9, 'o': 29, 'r': 22, 't': 29, 'u': 14, 'b': 2, 'j': 1, 'c': 13, 'd': 15, 'n': 16, 'v': 3, 'l': 16, 'p': 11, 'g': 22, ',': 7, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 18, 'e</w>': 16, 'an': 11, 's</w>': 10})\n",
      "Number of tokens: 35\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'f o r </w>': 4, 't h i s</w>': 1, 's u b j e c t </w>': 1, 'i s</w>': 2, 's t u d e n t s</w>': 1, 't o </w>': 2, 'd e v e l o p </w>': 1, 'an </w>': 1, 'u n d e r s t an d in g </w>': 1, 'o f </w>': 2, 't h e</w>': 2, 'm a in </w>': 1, 'a l g o r i t h m s</w>': 1, 'u s e d </w>': 2, 'in </w>': 3, 'n a t u r a l l an g u a g e</w>': 1, 'p r o c e s s in g , </w>': 1, 'u s e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r an g e</w>': 1, 'a p p l i c a t i o n s</w>': 1, 'in c l u d in g </w>': 1, 't e x t </w>': 1, 'c l a s s i f i c a t i o n , </w>': 1, 'm a c h in e</w>': 1, 't r an s l a t i o n , </w>': 1, 'an d </w>': 3, 'q u e s t i o n </w>': 1, 'an s w e r in g . </w>': 1, 'T o p i c s</w>': 1, 'b e</w>': 1, 'c o v e r e d </w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g in g , </w>': 1, 'n - g r a m </w>': 1, 'l an g u a g e</w>': 2, 'm o d e l l in g , </w>': 1, 's y n t a c t i c </w>': 1, 'p a r s in g </w>': 1, 'd e e p </w>': 1, 'l e a r n in g . </w>': 1, 'p r o g r a m m in g </w>': 1, 'P y t h o n , </w>': 1, 's e e</w>': 1, 'm o r e</w>': 1, 'in f o r m a t i o n </w>': 1, 'o n </w>': 1, 'i t s</w>': 1, 'w o r k s h o p s , </w>': 1, 'a s s i g n m e n t s</w>': 1, 'in s t a l l a t i o n </w>': 1, 'a t </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 4\n",
      "Best pair: ('in', 'g')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 11, 'e': 23, '</w>': 46, 'a': 27, 'i': 19, 'm': 12, 's': 24, 'f': 9, 'o': 29, 'r': 22, 't': 29, 'u': 14, 'b': 2, 'j': 1, 'c': 13, 'd': 15, 'n': 16, 'v': 3, 'l': 16, 'p': 11, 'g': 13, ',': 7, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 9, 'e</w>': 16, 'an': 11, 's</w>': 10, 'ing': 9})\n",
      "Number of tokens: 36\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'f o r </w>': 4, 't h i s</w>': 1, 's u b j e c t </w>': 1, 'i s</w>': 2, 's t u d e n t s</w>': 1, 't o </w>': 2, 'd e v e l o p </w>': 1, 'an </w>': 1, 'u n d e r s t an d ing </w>': 1, 'o f </w>': 2, 't h e</w>': 2, 'm a in </w>': 1, 'a l g o r i t h m s</w>': 1, 'u s e d </w>': 2, 'in </w>': 3, 'n a t u r a l l an g u a g e</w>': 1, 'p r o c e s s ing , </w>': 1, 'u s e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r an g e</w>': 1, 'a p p l i c a t i o n s</w>': 1, 'in c l u d ing </w>': 1, 't e x t </w>': 1, 'c l a s s i f i c a t i o n , </w>': 1, 'm a c h in e</w>': 1, 't r an s l a t i o n , </w>': 1, 'an d </w>': 3, 'q u e s t i o n </w>': 1, 'an s w e r ing . </w>': 1, 'T o p i c s</w>': 1, 'b e</w>': 1, 'c o v e r e d </w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g ing , </w>': 1, 'n - g r a m </w>': 1, 'l an g u a g e</w>': 2, 'm o d e l l ing , </w>': 1, 's y n t a c t i c </w>': 1, 'p a r s ing </w>': 1, 'd e e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing </w>': 1, 'P y t h o n , </w>': 1, 's e e</w>': 1, 'm o r e</w>': 1, 'in f o r m a t i o n </w>': 1, 'o n </w>': 1, 'i t s</w>': 1, 'w o r k s h o p s , </w>': 1, 'a s s i g n m e n t s</w>': 1, 'in s t a l l a t i o n </w>': 1, 'a t </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 5\n",
      "Best pair: ('o', 'r')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 11, 'e': 23, '</w>': 46, 'a': 27, 'i': 19, 'm': 12, 's': 24, 'f': 9, 'o': 21, 'r': 14, 't': 29, 'u': 14, 'b': 2, 'j': 1, 'c': 13, 'd': 15, 'n': 16, 'v': 3, 'l': 16, 'p': 11, 'g': 13, ',': 7, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 9, 'e</w>': 16, 'an': 11, 's</w>': 10, 'ing': 9, 'or': 8})\n",
      "Number of tokens: 37\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'f or </w>': 4, 't h i s</w>': 1, 's u b j e c t </w>': 1, 'i s</w>': 2, 's t u d e n t s</w>': 1, 't o </w>': 2, 'd e v e l o p </w>': 1, 'an </w>': 1, 'u n d e r s t an d ing </w>': 1, 'o f </w>': 2, 't h e</w>': 2, 'm a in </w>': 1, 'a l g or i t h m s</w>': 1, 'u s e d </w>': 2, 'in </w>': 3, 'n a t u r a l l an g u a g e</w>': 1, 'p r o c e s s ing , </w>': 1, 'u s e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r an g e</w>': 1, 'a p p l i c a t i o n s</w>': 1, 'in c l u d ing </w>': 1, 't e x t </w>': 1, 'c l a s s i f i c a t i o n , </w>': 1, 'm a c h in e</w>': 1, 't r an s l a t i o n , </w>': 1, 'an d </w>': 3, 'q u e s t i o n </w>': 1, 'an s w e r ing . </w>': 1, 'T o p i c s</w>': 1, 'b e</w>': 1, 'c o v e r e d </w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g ing , </w>': 1, 'n - g r a m </w>': 1, 'l an g u a g e</w>': 2, 'm o d e l l ing , </w>': 1, 's y n t a c t i c </w>': 1, 'p a r s ing </w>': 1, 'd e e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing </w>': 1, 'P y t h o n , </w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in f or m a t i o n </w>': 1, 'o n </w>': 1, 'i t s</w>': 1, 'w or k s h o p s , </w>': 1, 'a s s i g n m e n t s</w>': 1, 'in s t a l l a t i o n </w>': 1, 'a t </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 6\n",
      "Best pair: ('o', 'n')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 11, 'e': 23, '</w>': 46, 'a': 27, 'i': 19, 'm': 12, 's': 24, 'f': 9, 'o': 13, 'r': 14, 't': 29, 'u': 14, 'b': 2, 'j': 1, 'c': 13, 'd': 15, 'n': 8, 'v': 3, 'l': 16, 'p': 11, 'g': 13, ',': 7, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 9, 'e</w>': 16, 'an': 11, 's</w>': 10, 'ing': 9, 'or': 8, 'on': 8})\n",
      "Number of tokens: 38\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'f or </w>': 4, 't h i s</w>': 1, 's u b j e c t </w>': 1, 'i s</w>': 2, 's t u d e n t s</w>': 1, 't o </w>': 2, 'd e v e l o p </w>': 1, 'an </w>': 1, 'u n d e r s t an d ing </w>': 1, 'o f </w>': 2, 't h e</w>': 2, 'm a in </w>': 1, 'a l g or i t h m s</w>': 1, 'u s e d </w>': 2, 'in </w>': 3, 'n a t u r a l l an g u a g e</w>': 1, 'p r o c e s s ing , </w>': 1, 'u s e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r an g e</w>': 1, 'a p p l i c a t i on s</w>': 1, 'in c l u d ing </w>': 1, 't e x t </w>': 1, 'c l a s s i f i c a t i on , </w>': 1, 'm a c h in e</w>': 1, 't r an s l a t i on , </w>': 1, 'an d </w>': 3, 'q u e s t i on </w>': 1, 'an s w e r ing . </w>': 1, 'T o p i c s</w>': 1, 'b e</w>': 1, 'c o v e r e d </w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g ing , </w>': 1, 'n - g r a m </w>': 1, 'l an g u a g e</w>': 2, 'm o d e l l ing , </w>': 1, 's y n t a c t i c </w>': 1, 'p a r s ing </w>': 1, 'd e e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing </w>': 1, 'P y t h on , </w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in f or m a t i on </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h o p s , </w>': 1, 'a s s i g n m e n t s</w>': 1, 'in s t a l l a t i on </w>': 1, 'a t </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 7\n",
      "Best pair: ('a', 't')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 11, 'e': 23, '</w>': 46, 'a': 20, 'i': 19, 'm': 12, 's': 24, 'f': 9, 'o': 13, 'r': 14, 't': 22, 'u': 14, 'b': 2, 'j': 1, 'c': 13, 'd': 15, 'n': 8, 'v': 3, 'l': 16, 'p': 11, 'g': 13, ',': 7, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 9, 'e</w>': 16, 'an': 11, 's</w>': 10, 'ing': 9, 'or': 8, 'on': 8, 'at': 7})\n",
      "Number of tokens: 39\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'f or </w>': 4, 't h i s</w>': 1, 's u b j e c t </w>': 1, 'i s</w>': 2, 's t u d e n t s</w>': 1, 't o </w>': 2, 'd e v e l o p </w>': 1, 'an </w>': 1, 'u n d e r s t an d ing </w>': 1, 'o f </w>': 2, 't h e</w>': 2, 'm a in </w>': 1, 'a l g or i t h m s</w>': 1, 'u s e d </w>': 2, 'in </w>': 3, 'n at u r a l l an g u a g e</w>': 1, 'p r o c e s s ing , </w>': 1, 'u s e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r an g e</w>': 1, 'a p p l i c at i on s</w>': 1, 'in c l u d ing </w>': 1, 't e x t </w>': 1, 'c l a s s i f i c at i on , </w>': 1, 'm a c h in e</w>': 1, 't r an s l at i on , </w>': 1, 'an d </w>': 3, 'q u e s t i on </w>': 1, 'an s w e r ing . </w>': 1, 'T o p i c s</w>': 1, 'b e</w>': 1, 'c o v e r e d </w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g ing , </w>': 1, 'n - g r a m </w>': 1, 'l an g u a g e</w>': 2, 'm o d e l l ing , </w>': 1, 's y n t a c t i c </w>': 1, 'p a r s ing </w>': 1, 'd e e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing </w>': 1, 'P y t h on , </w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in f or m at i on </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h o p s , </w>': 1, 'a s s i g n m e n t s</w>': 1, 'in s t a l l at i on </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 8\n",
      "Best pair: (',', '</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 11, 'e': 23, '</w>': 39, 'a': 20, 'i': 19, 'm': 12, 's': 24, 'f': 9, 'o': 13, 'r': 14, 't': 22, 'u': 14, 'b': 2, 'j': 1, 'c': 13, 'd': 15, 'n': 8, 'v': 3, 'l': 16, 'p': 11, 'g': 13, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 9, 'e</w>': 16, 'an': 11, 's</w>': 10, 'ing': 9, 'or': 8, 'on': 8, 'at': 7, ',</w>': 7})\n",
      "Number of tokens: 40\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'f or </w>': 4, 't h i s</w>': 1, 's u b j e c t </w>': 1, 'i s</w>': 2, 's t u d e n t s</w>': 1, 't o </w>': 2, 'd e v e l o p </w>': 1, 'an </w>': 1, 'u n d e r s t an d ing </w>': 1, 'o f </w>': 2, 't h e</w>': 2, 'm a in </w>': 1, 'a l g or i t h m s</w>': 1, 'u s e d </w>': 2, 'in </w>': 3, 'n at u r a l l an g u a g e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'u s e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r an g e</w>': 1, 'a p p l i c at i on s</w>': 1, 'in c l u d ing </w>': 1, 't e x t </w>': 1, 'c l a s s i f i c at i on ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l at i on ,</w>': 1, 'an d </w>': 3, 'q u e s t i on </w>': 1, 'an s w e r ing . </w>': 1, 'T o p i c s</w>': 1, 'b e</w>': 1, 'c o v e r e d </w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l an g u a g e</w>': 2, 'm o d e l l ing ,</w>': 1, 's y n t a c t i c </w>': 1, 'p a r s ing </w>': 1, 'd e e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing </w>': 1, 'P y t h on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in f or m at i on </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h o p s ,</w>': 1, 'a s s i g n m e n t s</w>': 1, 'in s t a l l at i on </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 9\n",
      "Best pair: ('d', '</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 11, 'e': 23, '</w>': 33, 'a': 20, 'i': 19, 'm': 12, 's': 24, 'f': 9, 'o': 13, 'r': 14, 't': 22, 'u': 14, 'b': 2, 'j': 1, 'c': 13, 'd': 9, 'n': 8, 'v': 3, 'l': 16, 'p': 11, 'g': 13, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 9, 'e</w>': 16, 'an': 11, 's</w>': 10, 'ing': 9, 'or': 8, 'on': 8, 'at': 7, ',</w>': 7, 'd</w>': 6})\n",
      "Number of tokens: 41\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'f or </w>': 4, 't h i s</w>': 1, 's u b j e c t </w>': 1, 'i s</w>': 2, 's t u d e n t s</w>': 1, 't o </w>': 2, 'd e v e l o p </w>': 1, 'an </w>': 1, 'u n d e r s t an d ing </w>': 1, 'o f </w>': 2, 't h e</w>': 2, 'm a in </w>': 1, 'a l g or i t h m s</w>': 1, 'u s e d</w>': 2, 'in </w>': 3, 'n at u r a l l an g u a g e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'u s e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r an g e</w>': 1, 'a p p l i c at i on s</w>': 1, 'in c l u d ing </w>': 1, 't e x t </w>': 1, 'c l a s s i f i c at i on ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l at i on ,</w>': 1, 'an d</w>': 3, 'q u e s t i on </w>': 1, 'an s w e r ing . </w>': 1, 'T o p i c s</w>': 1, 'b e</w>': 1, 'c o v e r e d</w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l an g u a g e</w>': 2, 'm o d e l l ing ,</w>': 1, 's y n t a c t i c </w>': 1, 'p a r s ing </w>': 1, 'd e e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing </w>': 1, 'P y t h on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in f or m at i on </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h o p s ,</w>': 1, 'a s s i g n m e n t s</w>': 1, 'in s t a l l at i on </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 10\n",
      "Best pair: ('i', 'on')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 11, 'e': 23, '</w>': 33, 'a': 20, 'i': 13, 'm': 12, 's': 24, 'f': 9, 'o': 13, 'r': 14, 't': 22, 'u': 14, 'b': 2, 'j': 1, 'c': 13, 'd': 9, 'n': 8, 'v': 3, 'l': 16, 'p': 11, 'g': 13, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 9, 'e</w>': 16, 'an': 11, 's</w>': 10, 'ing': 9, 'or': 8, 'on': 2, 'at': 7, ',</w>': 7, 'd</w>': 6, 'ion': 6})\n",
      "Number of tokens: 42\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'f or </w>': 4, 't h i s</w>': 1, 's u b j e c t </w>': 1, 'i s</w>': 2, 's t u d e n t s</w>': 1, 't o </w>': 2, 'd e v e l o p </w>': 1, 'an </w>': 1, 'u n d e r s t an d ing </w>': 1, 'o f </w>': 2, 't h e</w>': 2, 'm a in </w>': 1, 'a l g or i t h m s</w>': 1, 'u s e d</w>': 2, 'in </w>': 3, 'n at u r a l l an g u a g e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'u s e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r an g e</w>': 1, 'a p p l i c at ion s</w>': 1, 'in c l u d ing </w>': 1, 't e x t </w>': 1, 'c l a s s i f i c at ion ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l at ion ,</w>': 1, 'an d</w>': 3, 'q u e s t ion </w>': 1, 'an s w e r ing . </w>': 1, 'T o p i c s</w>': 1, 'b e</w>': 1, 'c o v e r e d</w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l an g u a g e</w>': 2, 'm o d e l l ing ,</w>': 1, 's y n t a c t i c </w>': 1, 'p a r s ing </w>': 1, 'd e e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing </w>': 1, 'P y t h on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in f or m at ion </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h o p s ,</w>': 1, 'a s s i g n m e n t s</w>': 1, 'in s t a l l at ion </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 11\n",
      "Best pair: ('f', 'or')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 11, 'e': 23, '</w>': 33, 'a': 20, 'i': 13, 'm': 12, 's': 24, 'f': 4, 'o': 13, 'r': 14, 't': 22, 'u': 14, 'b': 2, 'j': 1, 'c': 13, 'd': 9, 'n': 8, 'v': 3, 'l': 16, 'p': 11, 'g': 13, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 9, 'e</w>': 16, 'an': 11, 's</w>': 10, 'ing': 9, 'or': 3, 'on': 2, 'at': 7, ',</w>': 7, 'd</w>': 6, 'ion': 6, 'for': 5})\n",
      "Number of tokens: 43\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for </w>': 4, 't h i s</w>': 1, 's u b j e c t </w>': 1, 'i s</w>': 2, 's t u d e n t s</w>': 1, 't o </w>': 2, 'd e v e l o p </w>': 1, 'an </w>': 1, 'u n d e r s t an d ing </w>': 1, 'o f </w>': 2, 't h e</w>': 2, 'm a in </w>': 1, 'a l g or i t h m s</w>': 1, 'u s e d</w>': 2, 'in </w>': 3, 'n at u r a l l an g u a g e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'u s e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r an g e</w>': 1, 'a p p l i c at ion s</w>': 1, 'in c l u d ing </w>': 1, 't e x t </w>': 1, 'c l a s s i f i c at ion ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l at ion ,</w>': 1, 'an d</w>': 3, 'q u e s t ion </w>': 1, 'an s w e r ing . </w>': 1, 'T o p i c s</w>': 1, 'b e</w>': 1, 'c o v e r e d</w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l an g u a g e</w>': 2, 'm o d e l l ing ,</w>': 1, 's y n t a c t i c </w>': 1, 'p a r s ing </w>': 1, 'd e e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing </w>': 1, 'P y t h on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m at ion </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h o p s ,</w>': 1, 'a s s i g n m e n t s</w>': 1, 'in s t a l l at ion </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 12\n",
      "Best pair: ('t', 'h')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 23, '</w>': 33, 'a': 20, 'i': 13, 'm': 12, 's': 24, 'f': 4, 'o': 13, 'r': 14, 't': 17, 'u': 14, 'b': 2, 'j': 1, 'c': 13, 'd': 9, 'n': 8, 'v': 3, 'l': 16, 'p': 11, 'g': 13, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 9, 'e</w>': 16, 'an': 11, 's</w>': 10, 'ing': 9, 'or': 3, 'on': 2, 'at': 7, ',</w>': 7, 'd</w>': 6, 'ion': 6, 'for': 5, 'th': 5})\n",
      "Number of tokens: 44\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for </w>': 4, 'th i s</w>': 1, 's u b j e c t </w>': 1, 'i s</w>': 2, 's t u d e n t s</w>': 1, 't o </w>': 2, 'd e v e l o p </w>': 1, 'an </w>': 1, 'u n d e r s t an d ing </w>': 1, 'o f </w>': 2, 'th e</w>': 2, 'm a in </w>': 1, 'a l g or i th m s</w>': 1, 'u s e d</w>': 2, 'in </w>': 3, 'n at u r a l l an g u a g e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'u s e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r an g e</w>': 1, 'a p p l i c at ion s</w>': 1, 'in c l u d ing </w>': 1, 't e x t </w>': 1, 'c l a s s i f i c at ion ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l at ion ,</w>': 1, 'an d</w>': 3, 'q u e s t ion </w>': 1, 'an s w e r ing . </w>': 1, 'T o p i c s</w>': 1, 'b e</w>': 1, 'c o v e r e d</w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l an g u a g e</w>': 2, 'm o d e l l ing ,</w>': 1, 's y n t a c t i c </w>': 1, 'p a r s ing </w>': 1, 'd e e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing </w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m at ion </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h o p s ,</w>': 1, 'a s s i g n m e n t s</w>': 1, 'in s t a l l at ion </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 13\n",
      "Best pair: ('d', 'e')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 18, '</w>': 33, 'a': 20, 'i': 13, 'm': 12, 's': 24, 'f': 4, 'o': 13, 'r': 14, 't': 17, 'u': 14, 'b': 2, 'j': 1, 'c': 13, 'd': 4, 'n': 8, 'v': 3, 'l': 16, 'p': 11, 'g': 13, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 9, 'e</w>': 16, 'an': 11, 's</w>': 10, 'ing': 9, 'or': 3, 'on': 2, 'at': 7, ',</w>': 7, 'd</w>': 6, 'ion': 6, 'for': 5, 'th': 5, 'de': 5})\n",
      "Number of tokens: 45\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for </w>': 4, 'th i s</w>': 1, 's u b j e c t </w>': 1, 'i s</w>': 2, 's t u de n t s</w>': 1, 't o </w>': 2, 'de v e l o p </w>': 1, 'an </w>': 1, 'u n de r s t an d ing </w>': 1, 'o f </w>': 2, 'th e</w>': 2, 'm a in </w>': 1, 'a l g or i th m s</w>': 1, 'u s e d</w>': 2, 'in </w>': 3, 'n at u r a l l an g u a g e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'u s e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r an g e</w>': 1, 'a p p l i c at ion s</w>': 1, 'in c l u d ing </w>': 1, 't e x t </w>': 1, 'c l a s s i f i c at ion ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l at ion ,</w>': 1, 'an d</w>': 3, 'q u e s t ion </w>': 1, 'an s w e r ing . </w>': 1, 'T o p i c s</w>': 1, 'b e</w>': 1, 'c o v e r e d</w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l an g u a g e</w>': 2, 'm o de l l ing ,</w>': 1, 's y n t a c t i c </w>': 1, 'p a r s ing </w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing </w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m at ion </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h o p s ,</w>': 1, 'a s s i g n m e n t s</w>': 1, 'in s t a l l at ion </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 14\n",
      "Best pair: ('at', 'ion')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 18, '</w>': 33, 'a': 20, 'i': 13, 'm': 12, 's': 24, 'f': 4, 'o': 13, 'r': 14, 't': 17, 'u': 14, 'b': 2, 'j': 1, 'c': 13, 'd': 4, 'n': 8, 'v': 3, 'l': 16, 'p': 11, 'g': 13, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 9, 'e</w>': 16, 'an': 11, 's</w>': 10, 'ing': 9, 'or': 3, 'on': 2, 'at': 2, ',</w>': 7, 'd</w>': 6, 'ion': 1, 'for': 5, 'th': 5, 'de': 5, 'ation': 5})\n",
      "Number of tokens: 46\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for </w>': 4, 'th i s</w>': 1, 's u b j e c t </w>': 1, 'i s</w>': 2, 's t u de n t s</w>': 1, 't o </w>': 2, 'de v e l o p </w>': 1, 'an </w>': 1, 'u n de r s t an d ing </w>': 1, 'o f </w>': 2, 'th e</w>': 2, 'm a in </w>': 1, 'a l g or i th m s</w>': 1, 'u s e d</w>': 2, 'in </w>': 3, 'n at u r a l l an g u a g e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'u s e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r an g e</w>': 1, 'a p p l i c ation s</w>': 1, 'in c l u d ing </w>': 1, 't e x t </w>': 1, 'c l a s s i f i c ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e s t ion </w>': 1, 'an s w e r ing . </w>': 1, 'T o p i c s</w>': 1, 'b e</w>': 1, 'c o v e r e d</w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l an g u a g e</w>': 2, 'm o de l l ing ,</w>': 1, 's y n t a c t i c </w>': 1, 'p a r s ing </w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing </w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h o p s ,</w>': 1, 'a s s i g n m e n t s</w>': 1, 'in s t a l l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 15\n",
      "Best pair: ('for', '</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 18, '</w>': 29, 'a': 20, 'i': 13, 'm': 12, 's': 24, 'f': 4, 'o': 13, 'r': 14, 't': 17, 'u': 14, 'b': 2, 'j': 1, 'c': 13, 'd': 4, 'n': 8, 'v': 3, 'l': 16, 'p': 11, 'g': 13, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 9, 'e</w>': 16, 'an': 11, 's</w>': 10, 'ing': 9, 'or': 3, 'on': 2, 'at': 2, ',</w>': 7, 'd</w>': 6, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4})\n",
      "Number of tokens: 47\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th i s</w>': 1, 's u b j e c t </w>': 1, 'i s</w>': 2, 's t u de n t s</w>': 1, 't o </w>': 2, 'de v e l o p </w>': 1, 'an </w>': 1, 'u n de r s t an d ing </w>': 1, 'o f </w>': 2, 'th e</w>': 2, 'm a in </w>': 1, 'a l g or i th m s</w>': 1, 'u s e d</w>': 2, 'in </w>': 3, 'n at u r a l l an g u a g e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'u s e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r an g e</w>': 1, 'a p p l i c ation s</w>': 1, 'in c l u d ing </w>': 1, 't e x t </w>': 1, 'c l a s s i f i c ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e s t ion </w>': 1, 'an s w e r ing . </w>': 1, 'T o p i c s</w>': 1, 'b e</w>': 1, 'c o v e r e d</w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l an g u a g e</w>': 2, 'm o de l l ing ,</w>': 1, 's y n t a c t i c </w>': 1, 'p a r s ing </w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing </w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h o p s ,</w>': 1, 'a s s i g n m e n t s</w>': 1, 'in s t a l l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 16\n",
      "Best pair: ('s', 't')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 18, '</w>': 29, 'a': 20, 'i': 13, 'm': 12, 's': 20, 'f': 4, 'o': 13, 'r': 14, 't': 13, 'u': 14, 'b': 2, 'j': 1, 'c': 13, 'd': 4, 'n': 8, 'v': 3, 'l': 16, 'p': 11, 'g': 13, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 9, 'e</w>': 16, 'an': 11, 's</w>': 10, 'ing': 9, 'or': 3, 'on': 2, 'at': 2, ',</w>': 7, 'd</w>': 6, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4})\n",
      "Number of tokens: 48\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th i s</w>': 1, 's u b j e c t </w>': 1, 'i s</w>': 2, 'st u de n t s</w>': 1, 't o </w>': 2, 'de v e l o p </w>': 1, 'an </w>': 1, 'u n de r st an d ing </w>': 1, 'o f </w>': 2, 'th e</w>': 2, 'm a in </w>': 1, 'a l g or i th m s</w>': 1, 'u s e d</w>': 2, 'in </w>': 3, 'n at u r a l l an g u a g e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'u s e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r an g e</w>': 1, 'a p p l i c ation s</w>': 1, 'in c l u d ing </w>': 1, 't e x t </w>': 1, 'c l a s s i f i c ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T o p i c s</w>': 1, 'b e</w>': 1, 'c o v e r e d</w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l an g u a g e</w>': 2, 'm o de l l ing ,</w>': 1, 's y n t a c t i c </w>': 1, 'p a r s ing </w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing </w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h o p s ,</w>': 1, 'a s s i g n m e n t s</w>': 1, 'in st a l l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 17\n",
      "Best pair: ('ing', '</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 18, '</w>': 25, 'a': 20, 'i': 13, 'm': 12, 's': 20, 'f': 4, 'o': 13, 'r': 14, 't': 13, 'u': 14, 'b': 2, 'j': 1, 'c': 13, 'd': 4, 'n': 8, 'v': 3, 'l': 16, 'p': 11, 'g': 13, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 9, 'e</w>': 16, 'an': 11, 's</w>': 10, 'ing': 5, 'or': 3, 'on': 2, 'at': 2, ',</w>': 7, 'd</w>': 6, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4})\n",
      "Number of tokens: 49\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th i s</w>': 1, 's u b j e c t </w>': 1, 'i s</w>': 2, 'st u de n t s</w>': 1, 't o </w>': 2, 'de v e l o p </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'o f </w>': 2, 'th e</w>': 2, 'm a in </w>': 1, 'a l g or i th m s</w>': 1, 'u s e d</w>': 2, 'in </w>': 3, 'n at u r a l l an g u a g e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'u s e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r an g e</w>': 1, 'a p p l i c ation s</w>': 1, 'in c l u d ing</w>': 1, 't e x t </w>': 1, 'c l a s s i f i c ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T o p i c s</w>': 1, 'b e</w>': 1, 'c o v e r e d</w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l an g u a g e</w>': 2, 'm o de l l ing ,</w>': 1, 's y n t a c t i c </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h o p s ,</w>': 1, 'a s s i g n m e n t s</w>': 1, 'in st a l l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 18\n",
      "Best pair: ('in', '</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 18, '</w>': 21, 'a': 20, 'i': 13, 'm': 12, 's': 20, 'f': 4, 'o': 13, 'r': 14, 't': 13, 'u': 14, 'b': 2, 'j': 1, 'c': 13, 'd': 4, 'n': 8, 'v': 3, 'l': 16, 'p': 11, 'g': 13, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 16, 'an': 11, 's</w>': 10, 'ing': 5, 'or': 3, 'on': 2, 'at': 2, ',</w>': 7, 'd</w>': 6, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4})\n",
      "Number of tokens: 50\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th i s</w>': 1, 's u b j e c t </w>': 1, 'i s</w>': 2, 'st u de n t s</w>': 1, 't o </w>': 2, 'de v e l o p </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'o f </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'a l g or i th m s</w>': 1, 'u s e d</w>': 2, 'in</w>': 3, 'n at u r a l l an g u a g e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'u s e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r an g e</w>': 1, 'a p p l i c ation s</w>': 1, 'in c l u d ing</w>': 1, 't e x t </w>': 1, 'c l a s s i f i c ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T o p i c s</w>': 1, 'b e</w>': 1, 'c o v e r e d</w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l an g u a g e</w>': 2, 'm o de l l ing ,</w>': 1, 's y n t a c t i c </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h o p s ,</w>': 1, 'a s s i g n m e n t s</w>': 1, 'in st a l l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 19\n",
      "Best pair: ('u', 's')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 18, '</w>': 21, 'a': 20, 'i': 13, 'm': 12, 's': 16, 'f': 4, 'o': 13, 'r': 14, 't': 13, 'u': 10, 'b': 2, 'j': 1, 'c': 13, 'd': 4, 'n': 8, 'v': 3, 'l': 16, 'p': 11, 'g': 13, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 16, 'an': 11, 's</w>': 10, 'ing': 5, 'or': 3, 'on': 2, 'at': 2, ',</w>': 7, 'd</w>': 6, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4})\n",
      "Number of tokens: 51\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th i s</w>': 1, 's u b j e c t </w>': 1, 'i s</w>': 2, 'st u de n t s</w>': 1, 't o </w>': 2, 'de v e l o p </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'o f </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'a l g or i th m s</w>': 1, 'us e d</w>': 2, 'in</w>': 3, 'n at u r a l l an g u a g e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r an g e</w>': 1, 'a p p l i c ation s</w>': 1, 'in c l u d ing</w>': 1, 't e x t </w>': 1, 'c l a s s i f i c ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T o p i c s</w>': 1, 'b e</w>': 1, 'c o v e r e d</w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l an g u a g e</w>': 2, 'm o de l l ing ,</w>': 1, 's y n t a c t i c </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h o p s ,</w>': 1, 'a s s i g n m e n t s</w>': 1, 'in st a l l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 20\n",
      "Best pair: ('an', 'g')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 18, '</w>': 21, 'a': 20, 'i': 13, 'm': 12, 's': 16, 'f': 4, 'o': 13, 'r': 14, 't': 13, 'u': 10, 'b': 2, 'j': 1, 'c': 13, 'd': 4, 'n': 8, 'v': 3, 'l': 16, 'p': 11, 'g': 9, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 16, 'an': 7, 's</w>': 10, 'ing': 5, 'or': 3, 'on': 2, 'at': 2, ',</w>': 7, 'd</w>': 6, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 4})\n",
      "Number of tokens: 52\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th i s</w>': 1, 's u b j e c t </w>': 1, 'i s</w>': 2, 'st u de n t s</w>': 1, 't o </w>': 2, 'de v e l o p </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'o f </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'a l g or i th m s</w>': 1, 'us e d</w>': 2, 'in</w>': 3, 'n at u r a l l ang u a g e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r ang e</w>': 1, 'a p p l i c ation s</w>': 1, 'in c l u d ing</w>': 1, 't e x t </w>': 1, 'c l a s s i f i c ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T o p i c s</w>': 1, 'b e</w>': 1, 'c o v e r e d</w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't a g g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l ang u a g e</w>': 2, 'm o de l l ing ,</w>': 1, 's y n t a c t i c </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h o p s ,</w>': 1, 'a s s i g n m e n t s</w>': 1, 'in st a l l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 21\n",
      "Best pair: ('a', 'g')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 18, '</w>': 21, 'a': 16, 'i': 13, 'm': 12, 's': 16, 'f': 4, 'o': 13, 'r': 14, 't': 13, 'u': 10, 'b': 2, 'j': 1, 'c': 13, 'd': 4, 'n': 8, 'v': 3, 'l': 16, 'p': 11, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 16, 'an': 7, 's</w>': 10, 'ing': 5, 'or': 3, 'on': 2, 'at': 2, ',</w>': 7, 'd</w>': 6, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 4, 'ag': 4})\n",
      "Number of tokens: 53\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th i s</w>': 1, 's u b j e c t </w>': 1, 'i s</w>': 2, 'st u de n t s</w>': 1, 't o </w>': 2, 'de v e l o p </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'o f </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'a l g or i th m s</w>': 1, 'us e d</w>': 2, 'in</w>': 3, 'n at u r a l l ang u ag e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r ang e</w>': 1, 'a p p l i c ation s</w>': 1, 'in c l u d ing</w>': 1, 't e x t </w>': 1, 'c l a s s i f i c ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T o p i c s</w>': 1, 'b e</w>': 1, 'c o v e r e d</w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't ag g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l ang u ag e</w>': 2, 'm o de l l ing ,</w>': 1, 's y n t a c t i c </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h o p s ,</w>': 1, 'a s s i g n m e n t s</w>': 1, 'in st a l l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 22\n",
      "Best pair: ('i', 'c')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 18, '</w>': 21, 'a': 16, 'i': 9, 'm': 12, 's': 16, 'f': 4, 'o': 13, 'r': 14, 't': 13, 'u': 10, 'b': 2, 'j': 1, 'c': 9, 'd': 4, 'n': 8, 'v': 3, 'l': 16, 'p': 11, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 16, 'an': 7, 's</w>': 10, 'ing': 5, 'or': 3, 'on': 2, 'at': 2, ',</w>': 7, 'd</w>': 6, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 4, 'ag': 4, 'ic': 4})\n",
      "Number of tokens: 54\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th i s</w>': 1, 's u b j e c t </w>': 1, 'i s</w>': 2, 'st u de n t s</w>': 1, 't o </w>': 2, 'de v e l o p </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'o f </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'a l g or i th m s</w>': 1, 'us e d</w>': 2, 'in</w>': 3, 'n at u r a l l ang u ag e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in c l u d ing</w>': 1, 't e x t </w>': 1, 'c l a s s i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T o p ic s</w>': 1, 'b e</w>': 1, 'c o v e r e d</w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't ag g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l ang u ag e</w>': 2, 'm o de l l ing ,</w>': 1, 's y n t a c t ic </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h o p s ,</w>': 1, 'a s s i g n m e n t s</w>': 1, 'in st a l l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 23\n",
      "Best pair: ('i', 's</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 18, '</w>': 21, 'a': 16, 'i': 6, 'm': 12, 's': 16, 'f': 4, 'o': 13, 'r': 14, 't': 13, 'u': 10, 'b': 2, 'j': 1, 'c': 9, 'd': 4, 'n': 8, 'v': 3, 'l': 16, 'p': 11, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 16, 'an': 7, 's</w>': 7, 'ing': 5, 'or': 3, 'on': 2, 'at': 2, ',</w>': 7, 'd</w>': 6, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 4, 'ag': 4, 'ic': 4, 'is</w>': 3})\n",
      "Number of tokens: 55\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j e c t </w>': 1, 'is</w>': 2, 'st u de n t s</w>': 1, 't o </w>': 2, 'de v e l o p </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'o f </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'a l g or i th m s</w>': 1, 'us e d</w>': 2, 'in</w>': 3, 'n at u r a l l ang u ag e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in c l u d ing</w>': 1, 't e x t </w>': 1, 'c l a s s i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T o p ic s</w>': 1, 'b e</w>': 1, 'c o v e r e d</w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't ag g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l ang u ag e</w>': 2, 'm o de l l ing ,</w>': 1, 's y n t a c t ic </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h o p s ,</w>': 1, 'a s s i g n m e n t s</w>': 1, 'in st a l l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 24\n",
      "Best pair: ('n', 't')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 18, '</w>': 21, 'a': 16, 'i': 6, 'm': 12, 's': 16, 'f': 4, 'o': 13, 'r': 14, 't': 10, 'u': 10, 'b': 2, 'j': 1, 'c': 9, 'd': 4, 'n': 5, 'v': 3, 'l': 16, 'p': 11, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 16, 'an': 7, 's</w>': 7, 'ing': 5, 'or': 3, 'on': 2, 'at': 2, ',</w>': 7, 'd</w>': 6, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 4, 'ag': 4, 'ic': 4, 'is</w>': 3, 'nt': 3})\n",
      "Number of tokens: 56\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j e c t </w>': 1, 'is</w>': 2, 'st u de nt s</w>': 1, 't o </w>': 2, 'de v e l o p </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'o f </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'a l g or i th m s</w>': 1, 'us e d</w>': 2, 'in</w>': 3, 'n at u r a l l ang u ag e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i v e r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in c l u d ing</w>': 1, 't e x t </w>': 1, 'c l a s s i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T o p ic s</w>': 1, 'b e</w>': 1, 'c o v e r e d</w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't ag g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l ang u ag e</w>': 2, 'm o de l l ing ,</w>': 1, 's y nt a c t ic </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h o p s ,</w>': 1, 'a s s i g n m e nt s</w>': 1, 'in st a l l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 25\n",
      "Best pair: ('v', 'e')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 15, '</w>': 21, 'a': 16, 'i': 6, 'm': 12, 's': 16, 'f': 4, 'o': 13, 'r': 14, 't': 10, 'u': 10, 'b': 2, 'j': 1, 'c': 9, 'd': 4, 'n': 5, 'v': 0, 'l': 16, 'p': 11, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 16, 'an': 7, 's</w>': 7, 'ing': 5, 'or': 3, 'on': 2, 'at': 2, ',</w>': 7, 'd</w>': 6, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 4, 'ag': 4, 'ic': 4, 'is</w>': 3, 'nt': 3, 've': 3})\n",
      "Number of tokens: 57\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j e c t </w>': 1, 'is</w>': 2, 'st u de nt s</w>': 1, 't o </w>': 2, 'de ve l o p </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'o f </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'a l g or i th m s</w>': 1, 'us e d</w>': 2, 'in</w>': 3, 'n at u r a l l ang u ag e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in c l u d ing</w>': 1, 't e x t </w>': 1, 'c l a s s i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T o p ic s</w>': 1, 'b e</w>': 1, 'c o ve r e d</w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't ag g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l ang u ag e</w>': 2, 'm o de l l ing ,</w>': 1, 's y nt a c t ic </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h o p s ,</w>': 1, 'a s s i g n m e nt s</w>': 1, 'in st a l l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 26\n",
      "Best pair: ('o', 'p')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 15, '</w>': 21, 'a': 16, 'i': 6, 'm': 12, 's': 16, 'f': 4, 'o': 10, 'r': 14, 't': 10, 'u': 10, 'b': 2, 'j': 1, 'c': 9, 'd': 4, 'n': 5, 'v': 0, 'l': 16, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 16, 'an': 7, 's</w>': 7, 'ing': 5, 'or': 3, 'on': 2, 'at': 2, ',</w>': 7, 'd</w>': 6, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 4, 'ag': 4, 'ic': 4, 'is</w>': 3, 'nt': 3, 've': 3, 'op': 3})\n",
      "Number of tokens: 58\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j e c t </w>': 1, 'is</w>': 2, 'st u de nt s</w>': 1, 't o </w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'o f </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'a l g or i th m s</w>': 1, 'us e d</w>': 2, 'in</w>': 3, 'n at u r a l l ang u ag e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in c l u d ing</w>': 1, 't e x t </w>': 1, 'c l a s s i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r e d</w>': 1, 'in c l u d e</w>': 1, 'p a r t - o f - s p e e c h </w>': 1, 't ag g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l ang u ag e</w>': 2, 'm o de l l ing ,</w>': 1, 's y nt a c t ic </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a s s i g n m e nt s</w>': 1, 'in st a l l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 27\n",
      "Best pair: ('o', 'f')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 15, '</w>': 21, 'a': 16, 'i': 6, 'm': 12, 's': 16, 'f': 1, 'o': 7, 'r': 14, 't': 10, 'u': 10, 'b': 2, 'j': 1, 'c': 9, 'd': 4, 'n': 5, 'v': 0, 'l': 16, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 16, 'an': 7, 's</w>': 7, 'ing': 5, 'or': 3, 'on': 2, 'at': 2, ',</w>': 7, 'd</w>': 6, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 4, 'ag': 4, 'ic': 4, 'is</w>': 3, 'nt': 3, 've': 3, 'op': 3, 'of': 3})\n",
      "Number of tokens: 59\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j e c t </w>': 1, 'is</w>': 2, 'st u de nt s</w>': 1, 't o </w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'a l g or i th m s</w>': 1, 'us e d</w>': 2, 'in</w>': 3, 'n at u r a l l ang u ag e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in c l u d ing</w>': 1, 't e x t </w>': 1, 'c l a s s i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r e d</w>': 1, 'in c l u d e</w>': 1, 'p a r t - of - s p e e c h </w>': 1, 't ag g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l ang u ag e</w>': 2, 'm o de l l ing ,</w>': 1, 's y nt a c t ic </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a s s i g n m e nt s</w>': 1, 'in st a l l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 28\n",
      "Best pair: ('a', 'l')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 15, '</w>': 21, 'a': 13, 'i': 6, 'm': 12, 's': 16, 'f': 1, 'o': 7, 'r': 14, 't': 10, 'u': 10, 'b': 2, 'j': 1, 'c': 9, 'd': 4, 'n': 5, 'v': 0, 'l': 13, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 16, 'an': 7, 's</w>': 7, 'ing': 5, 'or': 3, 'on': 2, 'at': 2, ',</w>': 7, 'd</w>': 6, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 4, 'ag': 4, 'ic': 4, 'is</w>': 3, 'nt': 3, 've': 3, 'op': 3, 'of': 3, 'al': 3})\n",
      "Number of tokens: 60\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j e c t </w>': 1, 'is</w>': 2, 'st u de nt s</w>': 1, 't o </w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th m s</w>': 1, 'us e d</w>': 2, 'in</w>': 3, 'n at u r al l ang u ag e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in c l u d ing</w>': 1, 't e x t </w>': 1, 'c l a s s i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r e d</w>': 1, 'in c l u d e</w>': 1, 'p a r t - of - s p e e c h </w>': 1, 't ag g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l ang u ag e</w>': 2, 'm o de l l ing ,</w>': 1, 's y nt a c t ic </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a s s i g n m e nt s</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 29\n",
      "Best pair: ('e', 'd</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 12, '</w>': 21, 'a': 13, 'i': 6, 'm': 12, 's': 16, 'f': 1, 'o': 7, 'r': 14, 't': 10, 'u': 10, 'b': 2, 'j': 1, 'c': 9, 'd': 4, 'n': 5, 'v': 0, 'l': 13, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 16, 'an': 7, 's</w>': 7, 'ing': 5, 'or': 3, 'on': 2, 'at': 2, ',</w>': 7, 'd</w>': 3, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 4, 'ag': 4, 'ic': 4, 'is</w>': 3, 'nt': 3, 've': 3, 'op': 3, 'of': 3, 'al': 3, 'ed</w>': 3})\n",
      "Number of tokens: 61\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j e c t </w>': 1, 'is</w>': 2, 'st u de nt s</w>': 1, 't o </w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th m s</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al l ang u ag e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in c l u d ing</w>': 1, 't e x t </w>': 1, 'c l a s s i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in c l u d e</w>': 1, 'p a r t - of - s p e e c h </w>': 1, 't ag g ing ,</w>': 1, 'n - g r a m </w>': 1, 'l ang u ag e</w>': 2, 'm o de l l ing ,</w>': 1, 's y nt a c t ic </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a s s i g n m e nt s</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 30\n",
      "Best pair: ('l', 'ang')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 12, '</w>': 21, 'a': 13, 'i': 6, 'm': 12, 's': 16, 'f': 1, 'o': 7, 'r': 14, 't': 10, 'u': 10, 'b': 2, 'j': 1, 'c': 9, 'd': 4, 'n': 5, 'v': 0, 'l': 10, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 16, 'an': 7, 's</w>': 7, 'ing': 5, 'or': 3, 'on': 2, 'at': 2, ',</w>': 7, 'd</w>': 3, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 4, 'ic': 4, 'is</w>': 3, 'nt': 3, 've': 3, 'op': 3, 'of': 3, 'al': 3, 'ed</w>': 3, 'lang': 3})\n",
      "Number of tokens: 62\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j e c t </w>': 1, 'is</w>': 2, 'st u de nt s</w>': 1, 't o </w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th m s</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al lang u ag e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in c l u d ing</w>': 1, 't e x t </w>': 1, 'c l a s s i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in c l u d e</w>': 1, 'p a r t - of - s p e e c h </w>': 1, 't ag g ing ,</w>': 1, 'n - g r a m </w>': 1, 'lang u ag e</w>': 2, 'm o de l l ing ,</w>': 1, 's y nt a c t ic </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a s s i g n m e nt s</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 31\n",
      "Best pair: ('lang', 'u')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 12, '</w>': 21, 'a': 13, 'i': 6, 'm': 12, 's': 16, 'f': 1, 'o': 7, 'r': 14, 't': 10, 'u': 7, 'b': 2, 'j': 1, 'c': 9, 'd': 4, 'n': 5, 'v': 0, 'l': 10, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 16, 'an': 7, 's</w>': 7, 'ing': 5, 'or': 3, 'on': 2, 'at': 2, ',</w>': 7, 'd</w>': 3, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 4, 'ic': 4, 'is</w>': 3, 'nt': 3, 've': 3, 'op': 3, 'of': 3, 'al': 3, 'ed</w>': 3, 'lang': 0, 'langu': 3})\n",
      "Number of tokens: 63\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j e c t </w>': 1, 'is</w>': 2, 'st u de nt s</w>': 1, 't o </w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th m s</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al langu ag e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in c l u d ing</w>': 1, 't e x t </w>': 1, 'c l a s s i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in c l u d e</w>': 1, 'p a r t - of - s p e e c h </w>': 1, 't ag g ing ,</w>': 1, 'n - g r a m </w>': 1, 'langu ag e</w>': 2, 'm o de l l ing ,</w>': 1, 's y nt a c t ic </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a s s i g n m e nt s</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 32\n",
      "Best pair: ('langu', 'ag')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 12, '</w>': 21, 'a': 13, 'i': 6, 'm': 12, 's': 16, 'f': 1, 'o': 7, 'r': 14, 't': 10, 'u': 7, 'b': 2, 'j': 1, 'c': 9, 'd': 4, 'n': 5, 'v': 0, 'l': 10, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 16, 'an': 7, 's</w>': 7, 'ing': 5, 'or': 3, 'on': 2, 'at': 2, ',</w>': 7, 'd</w>': 3, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 3, 've': 3, 'op': 3, 'of': 3, 'al': 3, 'ed</w>': 3, 'lang': 0, 'langu': 0, 'languag': 3})\n",
      "Number of tokens: 64\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j e c t </w>': 1, 'is</w>': 2, 'st u de nt s</w>': 1, 't o </w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th m s</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al languag e</w>': 1, 'p r o c e s s ing ,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in c l u d ing</w>': 1, 't e x t </w>': 1, 'c l a s s i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in c l u d e</w>': 1, 'p a r t - of - s p e e c h </w>': 1, 't ag g ing ,</w>': 1, 'n - g r a m </w>': 1, 'languag e</w>': 2, 'm o de l l ing ,</w>': 1, 's y nt a c t ic </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a s s i g n m e nt s</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 33\n",
      "Best pair: ('languag', 'e</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 12, '</w>': 21, 'a': 13, 'i': 6, 'm': 12, 's': 16, 'f': 1, 'o': 7, 'r': 14, 't': 10, 'u': 7, 'b': 2, 'j': 1, 'c': 9, 'd': 4, 'n': 5, 'v': 0, 'l': 10, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 13, 'an': 7, 's</w>': 7, 'ing': 5, 'or': 3, 'on': 2, 'at': 2, ',</w>': 7, 'd</w>': 3, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 3, 've': 3, 'op': 3, 'of': 3, 'al': 3, 'ed</w>': 3, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3})\n",
      "Number of tokens: 65\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j e c t </w>': 1, 'is</w>': 2, 'st u de nt s</w>': 1, 't o </w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th m s</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'p r o c e s s ing ,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in c l u d ing</w>': 1, 't e x t </w>': 1, 'c l a s s i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in c l u d e</w>': 1, 'p a r t - of - s p e e c h </w>': 1, 't ag g ing ,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing ,</w>': 1, 's y nt a c t ic </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a s s i g n m e nt s</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 34\n",
      "Best pair: ('s', 's')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 12, '</w>': 21, 'a': 13, 'i': 6, 'm': 12, 's': 10, 'f': 1, 'o': 7, 'r': 14, 't': 10, 'u': 7, 'b': 2, 'j': 1, 'c': 9, 'd': 4, 'n': 5, 'v': 0, 'l': 10, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 13, 'an': 7, 's</w>': 7, 'ing': 5, 'or': 3, 'on': 2, 'at': 2, ',</w>': 7, 'd</w>': 3, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 3, 've': 3, 'op': 3, 'of': 3, 'al': 3, 'ed</w>': 3, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3})\n",
      "Number of tokens: 66\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j e c t </w>': 1, 'is</w>': 2, 'st u de nt s</w>': 1, 't o </w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th m s</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'p r o c e ss ing ,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in c l u d ing</w>': 1, 't e x t </w>': 1, 'c l a ss i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in c l u d e</w>': 1, 'p a r t - of - s p e e c h </w>': 1, 't ag g ing ,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing ,</w>': 1, 's y nt a c t ic </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nt s</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 35\n",
      "Best pair: ('ing', ',</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 12, '</w>': 21, 'a': 13, 'i': 6, 'm': 12, 's': 10, 'f': 1, 'o': 7, 'r': 14, 't': 10, 'u': 7, 'b': 2, 'j': 1, 'c': 9, 'd': 4, 'n': 5, 'v': 0, 'l': 10, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 13, 'an': 7, 's</w>': 7, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 3, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 3, 've': 3, 'op': 3, 'of': 3, 'al': 3, 'ed</w>': 3, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3})\n",
      "Number of tokens: 67\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j e c t </w>': 1, 'is</w>': 2, 'st u de nt s</w>': 1, 't o </w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th m s</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'p r o c e ss ing,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in c l u d ing</w>': 1, 't e x t </w>': 1, 'c l a ss i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in c l u d e</w>': 1, 'p a r t - of - s p e e c h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nt s</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 36\n",
      "Best pair: ('c', 'l')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 12, '</w>': 21, 'a': 13, 'i': 6, 'm': 12, 's': 10, 'f': 1, 'o': 7, 'r': 14, 't': 10, 'u': 7, 'b': 2, 'j': 1, 'c': 6, 'd': 4, 'n': 5, 'v': 0, 'l': 7, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 13, 'an': 7, 's</w>': 7, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 3, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 3, 've': 3, 'op': 3, 'of': 3, 'al': 3, 'ed</w>': 3, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3})\n",
      "Number of tokens: 68\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j e c t </w>': 1, 'is</w>': 2, 'st u de nt s</w>': 1, 't o </w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th m s</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'p r o c e ss ing,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in cl u d ing</w>': 1, 't e x t </w>': 1, 'cl a ss i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'an d</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in cl u d e</w>': 1, 'p a r t - of - s p e e c h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nt s</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 37\n",
      "Best pair: ('an', 'd</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 12, '</w>': 21, 'a': 13, 'i': 6, 'm': 12, 's': 10, 'f': 1, 'o': 7, 'r': 14, 't': 10, 'u': 7, 'b': 2, 'j': 1, 'c': 6, 'd': 4, 'n': 5, 'v': 0, 'l': 7, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 3, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 13, 'an': 4, 's</w>': 7, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 3, 've': 3, 'op': 3, 'of': 3, 'al': 3, 'ed</w>': 3, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3, 'and</w>': 3})\n",
      "Number of tokens: 69\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j e c t </w>': 1, 'is</w>': 2, 'st u de nt s</w>': 1, 't o </w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th m s</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'p r o c e ss ing,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in cl u d ing</w>': 1, 't e x t </w>': 1, 'cl a ss i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing . </w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in cl u d e</w>': 1, 'p a r t - of - s p e e c h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing . </w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nt s</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e . </w>': 1}\n",
      "Iter: 38\n",
      "Best pair: ('.', '</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 12, '</w>': 18, 'a': 13, 'i': 6, 'm': 12, 's': 10, 'f': 1, 'o': 7, 'r': 14, 't': 10, 'u': 7, 'b': 2, 'j': 1, 'c': 6, 'd': 4, 'n': 5, 'v': 0, 'l': 7, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 13, 'an': 4, 's</w>': 7, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 3, 've': 3, 'op': 3, 'of': 3, 'al': 3, 'ed</w>': 3, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3, 'and</w>': 3, '.</w>': 3})\n",
      "Number of tokens: 70\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j e c t </w>': 1, 'is</w>': 2, 'st u de nt s</w>': 1, 't o </w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th m s</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'p r o c e ss ing,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in cl u d ing</w>': 1, 't e x t </w>': 1, 'cl a ss i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in cl u d e</w>': 1, 'p a r t - of - s p e e c h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p a r s ing</w>': 1, 'de e p </w>': 1, 'l e a r n ing .</w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nt s</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 39\n",
      "Best pair: ('a', 'r')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 3, 'h': 6, 'e': 12, '</w>': 18, 'a': 10, 'i': 6, 'm': 12, 's': 10, 'f': 1, 'o': 7, 'r': 11, 't': 10, 'u': 7, 'b': 2, 'j': 1, 'c': 6, 'd': 4, 'n': 5, 'v': 0, 'l': 7, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 13, 'an': 4, 's</w>': 7, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 3, 've': 3, 'op': 3, 'of': 3, 'al': 3, 'ed</w>': 3, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3, 'and</w>': 3, '.</w>': 3, 'ar': 3})\n",
      "Number of tokens: 71\n",
      "==========\n",
      "vocab,  {'T h e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j e c t </w>': 1, 'is</w>': 2, 'st u de nt s</w>': 1, 't o </w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th m s</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'p r o c e ss ing,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in cl u d ing</w>': 1, 't e x t </w>': 1, 'cl a ss i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in cl u d e</w>': 1, 'p ar t - of - s p e e c h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nt s</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 40\n",
      "Best pair: ('T', 'h')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 12, '</w>': 18, 'a': 10, 'i': 6, 'm': 12, 's': 10, 'f': 1, 'o': 7, 'r': 11, 't': 10, 'u': 7, 'b': 2, 'j': 1, 'c': 6, 'd': 4, 'n': 5, 'v': 0, 'l': 7, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 13, 'an': 4, 's</w>': 7, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 3, 've': 3, 'op': 3, 'of': 3, 'al': 3, 'ed</w>': 3, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 2})\n",
      "Number of tokens: 72\n",
      "==========\n",
      "vocab,  {'Th e</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j e c t </w>': 1, 'is</w>': 2, 'st u de nt s</w>': 1, 't o </w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th m s</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'p r o c e ss ing,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in cl u d ing</w>': 1, 't e x t </w>': 1, 'cl a ss i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in cl u d e</w>': 1, 'p ar t - of - s p e e c h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nt s</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 41\n",
      "Best pair: ('Th', 'e</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 12, '</w>': 18, 'a': 10, 'i': 6, 'm': 12, 's': 10, 'f': 1, 'o': 7, 'r': 11, 't': 10, 'u': 7, 'b': 2, 'j': 1, 'c': 6, 'd': 4, 'n': 5, 'v': 0, 'l': 7, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 11, 'an': 4, 's</w>': 7, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 3, 've': 3, 'op': 3, 'of': 3, 'al': 3, 'ed</w>': 3, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2})\n",
      "Number of tokens: 73\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i m s</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j e c t </w>': 1, 'is</w>': 2, 'st u de nt s</w>': 1, 't o </w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th m s</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'p r o c e ss ing,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in cl u d ing</w>': 1, 't e x t </w>': 1, 'cl a ss i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in cl u d e</w>': 1, 'p ar t - of - s p e e c h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nt s</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 42\n",
      "Best pair: ('m', 's</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 12, '</w>': 18, 'a': 10, 'i': 6, 'm': 10, 's': 10, 'f': 1, 'o': 7, 'r': 11, 't': 10, 'u': 7, 'b': 2, 'j': 1, 'c': 6, 'd': 4, 'n': 5, 'v': 0, 'l': 7, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 11, 'an': 4, 's</w>': 5, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 3, 've': 3, 'op': 3, 'of': 3, 'al': 3, 'ed</w>': 3, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2})\n",
      "Number of tokens: 74\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j e c t </w>': 1, 'is</w>': 2, 'st u de nt s</w>': 1, 't o </w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th ms</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'p r o c e ss ing,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in cl u d ing</w>': 1, 't e x t </w>': 1, 'cl a ss i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in cl u d e</w>': 1, 'p ar t - of - s p e e c h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nt s</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 43\n",
      "Best pair: ('e', 'c')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 18, 'a': 10, 'i': 6, 'm': 10, 's': 10, 'f': 1, 'o': 7, 'r': 11, 't': 10, 'u': 7, 'b': 2, 'j': 1, 'c': 4, 'd': 4, 'n': 5, 'v': 0, 'l': 7, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 11, 'an': 4, 's</w>': 5, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 3, 've': 3, 'op': 3, 'of': 3, 'al': 3, 'ed</w>': 3, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2})\n",
      "Number of tokens: 75\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t </w>': 1, 'is</w>': 2, 'st u de nt s</w>': 1, 't o </w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th ms</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'p r o c e ss ing,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in cl u d ing</w>': 1, 't e x t </w>': 1, 'cl a ss i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in cl u d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nt s</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 44\n",
      "Best pair: ('t', '</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 16, 'a': 10, 'i': 6, 'm': 10, 's': 10, 'f': 1, 'o': 7, 'r': 11, 't': 8, 'u': 7, 'b': 2, 'j': 1, 'c': 4, 'd': 4, 'n': 5, 'v': 0, 'l': 7, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 11, 'an': 4, 's</w>': 5, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 3, 've': 3, 'op': 3, 'of': 3, 'al': 3, 'ed</w>': 3, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2})\n",
      "Number of tokens: 76\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nt s</w>': 1, 't o </w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th ms</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'p r o c e ss ing,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in cl u d ing</w>': 1, 't e x t</w>': 1, 'cl a ss i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in cl u d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nt s</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 45\n",
      "Best pair: ('nt', 's</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 16, 'a': 10, 'i': 6, 'm': 10, 's': 10, 'f': 1, 'o': 7, 'r': 11, 't': 8, 'u': 7, 'b': 2, 'j': 1, 'c': 4, 'd': 4, 'n': 5, 'v': 0, 'l': 7, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 11, 'an': 4, 's</w>': 3, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 1, 've': 3, 'op': 3, 'of': 3, 'al': 3, 'ed</w>': 3, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2})\n",
      "Number of tokens: 77\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 't o </w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th ms</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'p r o c e ss ing,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in cl u d ing</w>': 1, 't e x t</w>': 1, 'cl a ss i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in cl u d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nts</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 46\n",
      "Best pair: ('t', 'o')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 16, 'a': 10, 'i': 6, 'm': 10, 's': 10, 'f': 1, 'o': 5, 'r': 11, 't': 6, 'u': 7, 'b': 2, 'j': 1, 'c': 4, 'd': 4, 'n': 5, 'v': 0, 'l': 7, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 11, 'an': 4, 's</w>': 3, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 1, 've': 3, 'op': 3, 'of': 3, 'al': 3, 'ed</w>': 3, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 2})\n",
      "Number of tokens: 78\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to </w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th ms</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'p r o c e ss ing,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in cl u d ing</w>': 1, 't e x t</w>': 1, 'cl a ss i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in cl u d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nts</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 47\n",
      "Best pair: ('to', '</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 14, 'a': 10, 'i': 6, 'm': 10, 's': 10, 'f': 1, 'o': 5, 'r': 11, 't': 6, 'u': 7, 'b': 2, 'j': 1, 'c': 4, 'd': 4, 'n': 5, 'v': 0, 'l': 7, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 11, 'an': 4, 's</w>': 3, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 4, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 1, 've': 3, 'op': 3, 'of': 3, 'al': 3, 'ed</w>': 3, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2})\n",
      "Number of tokens: 79\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an d ing</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th ms</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'p r o c e ss ing,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in cl u d ing</w>': 1, 't e x t</w>': 1, 'cl a ss i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in cl u d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nts</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 48\n",
      "Best pair: ('d', 'ing</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 14, 'a': 10, 'i': 6, 'm': 10, 's': 10, 'f': 1, 'o': 5, 'r': 11, 't': 6, 'u': 7, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 7, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 11, 'an': 4, 's</w>': 3, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 1, 've': 3, 'op': 3, 'of': 3, 'al': 3, 'ed</w>': 3, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2})\n",
      "Number of tokens: 80\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of </w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th ms</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'p r o c e ss ing,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in cl u ding</w>': 1, 't e x t</w>': 1, 'cl a ss i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in cl u d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nts</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 49\n",
      "Best pair: ('of', '</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 12, 'a': 10, 'i': 6, 'm': 10, 's': 10, 'f': 1, 'o': 5, 'r': 11, 't': 6, 'u': 7, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 7, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 11, 'an': 4, 's</w>': 3, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 5, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 1, 've': 3, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 3, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2})\n",
      "Number of tokens: 81\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'th e</w>': 2, 'm a in</w>': 1, 'al g or i th ms</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'p r o c e ss ing,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in cl u ding</w>': 1, 't e x t</w>': 1, 'cl a ss i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in cl u d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nts</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 50\n",
      "Best pair: ('th', 'e</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 12, 'a': 10, 'i': 6, 'm': 10, 's': 10, 'f': 1, 'o': 5, 'r': 11, 't': 6, 'u': 7, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 7, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 9, 'an': 4, 's</w>': 3, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 1, 've': 3, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 3, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2})\n",
      "Number of tokens: 82\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'm a in</w>': 1, 'al g or i th ms</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'p r o c e ss ing,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in cl u ding</w>': 1, 't e x t</w>': 1, 'cl a ss i f ic ation ,</w>': 1, 'm a c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in cl u d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nts</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 51\n",
      "Best pair: ('m', 'a')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 12, 'a': 8, 'i': 6, 'm': 8, 's': 10, 'f': 1, 'o': 5, 'r': 11, 't': 6, 'u': 7, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 7, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 9, 'an': 4, 's</w>': 3, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 4, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 1, 've': 3, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 3, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2})\n",
      "Number of tokens: 83\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'us ed</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'p r o c e ss ing,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in cl u ding</w>': 1, 't e x t</w>': 1, 'cl a ss i f ic ation ,</w>': 1, 'ma c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in cl u d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nts</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 52\n",
      "Best pair: ('us', 'ed</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 12, 'a': 8, 'i': 6, 'm': 8, 's': 10, 'f': 1, 'o': 5, 'r': 11, 't': 6, 'u': 7, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 7, 'p': 8, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 9, 'an': 4, 's</w>': 3, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 2, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 1, 've': 3, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2})\n",
      "Number of tokens: 84\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'p r o c e ss ing,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in cl u ding</w>': 1, 't e x t</w>': 1, 'cl a ss i f ic ation ,</w>': 1, 'ma c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in cl u d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'p r o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nts</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 53\n",
      "Best pair: ('p', 'r')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 12, 'a': 8, 'i': 6, 'm': 8, 's': 10, 'f': 1, 'o': 5, 'r': 9, 't': 6, 'u': 7, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 7, 'p': 6, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 9, 'an': 4, 's</w>': 3, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 2, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 1, 've': 3, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 2})\n",
      "Number of tokens: 85\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pr o c e ss ing,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in cl u ding</w>': 1, 't e x t</w>': 1, 'cl a ss i f ic ation ,</w>': 1, 'ma c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in cl u d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'pr o g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nts</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 54\n",
      "Best pair: ('pr', 'o')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 12, 'a': 8, 'i': 6, 'm': 8, 's': 10, 'f': 1, 'o': 3, 'r': 9, 't': 6, 'u': 7, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 7, 'p': 6, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 9, 'an': 4, 's</w>': 3, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 2, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 1, 've': 3, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2})\n",
      "Number of tokens: 86\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'us e</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in cl u ding</w>': 1, 't e x t</w>': 1, 'cl a ss i f ic ation ,</w>': 1, 'ma c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in cl u d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'pro g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nts</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 55\n",
      "Best pair: ('us', 'e</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 12, 'a': 8, 'i': 6, 'm': 8, 's': 10, 'f': 1, 'o': 3, 'r': 9, 't': 6, 'u': 7, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 7, 'p': 6, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 7, 'an': 4, 's</w>': 3, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 1, 've': 3, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2})\n",
      "Number of tokens: 87\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ve r s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in cl u ding</w>': 1, 't e x t</w>': 1, 'cl a ss i f ic ation ,</w>': 1, 'ma c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ve r ed</w>': 1, 'in cl u d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'pro g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nts</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 56\n",
      "Best pair: ('ve', 'r')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 12, 'a': 8, 'i': 6, 'm': 8, 's': 10, 'f': 1, 'o': 3, 'r': 7, 't': 6, 'u': 7, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 7, 'p': 6, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 7, 'an': 4, 's</w>': 3, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 5, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 4, 'is</w>': 3, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2})\n",
      "Number of tokens: 88\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ic ation s</w>': 1, 'in cl u ding</w>': 1, 't e x t</w>': 1, 'cl a ss i f ic ation ,</w>': 1, 'ma c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'in cl u d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'pro g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nts</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 57\n",
      "Best pair: ('ic', 'ation')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 12, 'a': 8, 'i': 6, 'm': 8, 's': 10, 'f': 1, 'o': 3, 'r': 7, 't': 6, 'u': 7, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 7, 'p': 6, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 5, 'e</w>': 7, 'an': 4, 's</w>': 3, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 3, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 3, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 3, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2})\n",
      "Number of tokens: 89\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'in cl u ding</w>': 1, 't e x t</w>': 1, 'cl a ss i f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'in cl u d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'pro g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nts</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 58\n",
      "Best pair: ('in', 'cl')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 12, 'a': 8, 'i': 6, 'm': 8, 's': 10, 'f': 1, 'o': 3, 'r': 7, 't': 6, 'u': 7, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 7, 'p': 6, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 4, 's</w>': 3, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 3, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 3, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 2})\n",
      "Number of tokens: 90\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'incl u ding</w>': 1, 't e x t</w>': 1, 'cl a ss i f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'incl u d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'pro g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nts</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 59\n",
      "Best pair: ('incl', 'u')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 12, 'a': 8, 'i': 6, 'm': 8, 's': 10, 'f': 1, 'o': 3, 'r': 7, 't': 6, 'u': 5, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 7, 'p': 6, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 4, 's</w>': 3, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 3, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 3, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 3, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2})\n",
      "Number of tokens: 91\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl a ss i f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'pro g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'a ss i g n m e nts</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 60\n",
      "Best pair: ('a', 'ss')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 12, 'a': 6, 'i': 6, 'm': 8, 's': 10, 'f': 1, 'o': 3, 'r': 7, 't': 6, 'u': 5, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 7, 'p': 6, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 4, 's</w>': 3, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 3, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 3, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 2})\n",
      "Number of tokens: 92\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl ass i f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'pro g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'ass i g n m e nts</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 61\n",
      "Best pair: ('ass', 'i')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 12, 'a': 6, 'i': 4, 'm': 8, 's': 10, 'f': 1, 'o': 3, 'r': 7, 't': 6, 'u': 5, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 7, 'p': 6, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 4, 's</w>': 3, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 3, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 3, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2})\n",
      "Number of tokens: 93\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r an s l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'an s w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'pro g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n m e nts</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 62\n",
      "Best pair: ('an', 's')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 12, 'a': 6, 'i': 4, 'm': 8, 's': 8, 'f': 1, 'o': 3, 'r': 7, 't': 6, 'u': 5, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 7, 'p': 6, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 3, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 3, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2})\n",
      "Number of tokens: 94\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans l ation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'pro g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n m e nts</w>': 1, 'in st al l ation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 63\n",
      "Best pair: ('l', 'ation')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 12, 'a': 6, 'i': 4, 'm': 8, 's': 8, 'f': 1, 'o': 3, 'r': 7, 't': 6, 'u': 5, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 5, 'p': 6, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 2, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 1, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 3, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 3, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2})\n",
      "Number of tokens: 95\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing .</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing .</w>': 1, 'pro g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n m e nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 64\n",
      "Best pair: ('ing', '.</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 12, 'a': 6, 'i': 4, 'm': 8, 's': 8, 'f': 1, 'o': 3, 'r': 7, 't': 6, 'u': 5, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 5, 'p': 6, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 1, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 3, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 3, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2})\n",
      "Number of tokens: 96\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'p ar t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'p ar s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n m e nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 65\n",
      "Best pair: ('p', 'ar')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 12, 'a': 6, 'i': 4, 'm': 8, 's': 8, 'f': 1, 'o': 3, 'r': 7, 't': 6, 'u': 5, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 5, 'p': 4, 'g': 5, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 1, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 3, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2})\n",
      "Number of tokens: 97\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - g r a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro g r a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n m e nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 66\n",
      "Best pair: ('g', 'r')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 12, 'a': 6, 'i': 4, 'm': 8, 's': 8, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 5, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 5, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 1, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 3, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 2})\n",
      "Number of tokens: 98\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gr a m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gr a m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n m e nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 67\n",
      "Best pair: ('gr', 'a')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 12, 'a': 4, 'i': 4, 'm': 8, 's': 8, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 5, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 5, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 1, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 3, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 2})\n",
      "Number of tokens: 99\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gra m </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gra m m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n m e nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 68\n",
      "Best pair: ('gra', 'm')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 10, '</w>': 12, 'a': 4, 'i': 4, 'm': 6, 's': 8, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 5, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 5, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 1, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 3, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2})\n",
      "Number of tokens: 100\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n m e nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o m e .</w>': 1}\n",
      "Iter: 69\n",
      "Best pair: ('m', 'e')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 12, 'a': 4, 'i': 4, 'm': 4, 's': 8, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 5, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 5, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 1, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 3, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2})\n",
      "Number of tokens: 101\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'a i ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 70\n",
      "Best pair: ('a', 'i')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 12, 'a': 3, 'i': 3, 'm': 4, 's': 8, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 5, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 5, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 1, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 3, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 2, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 1})\n",
      "Number of tokens: 102\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'ai ms</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 71\n",
      "Best pair: ('ai', 'ms</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 12, 'a': 3, 'i': 3, 'm': 4, 's': 8, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 5, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 5, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 3, 'de': 5, 'ation': 1, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 3, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1})\n",
      "Number of tokens: 103\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'th is</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 72\n",
      "Best pair: ('th', 'is</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 12, 'a': 3, 'i': 3, 'm': 4, 's': 8, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 5, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 5, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 5, 'ation': 1, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1})\n",
      "Number of tokens: 104\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 's u b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 73\n",
      "Best pair: ('s', 'u')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 12, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 4, 'b': 2, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 5, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 5, 'ation': 1, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 1})\n",
      "Number of tokens: 105\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'su b j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 74\n",
      "Best pair: ('su', 'b')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 12, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 4, 'b': 1, 'j': 1, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 5, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 5, 'ation': 1, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 1})\n",
      "Number of tokens: 106\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'sub j ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 75\n",
      "Best pair: ('sub', 'j')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 12, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 4, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 5, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 5, 'ation': 1, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 2, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 1})\n",
      "Number of tokens: 107\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subj ec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 76\n",
      "Best pair: ('subj', 'ec')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 12, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 4, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 5, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 5, 'ation': 1, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 1, 't</w>': 2, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 1})\n",
      "Number of tokens: 108\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subjec t</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 77\n",
      "Best pair: ('subjec', 't</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 12, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 4, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 5, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 5, 'ation': 1, 'for</w>': 4, 'st': 4, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 1, 't</w>': 1, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1})\n",
      "Number of tokens: 109\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'st u de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 78\n",
      "Best pair: ('st', 'u')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 12, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 3, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 5, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 5, 'ation': 1, 'for</w>': 4, 'st': 3, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 1, 't</w>': 1, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 1})\n",
      "Number of tokens: 110\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'stu de nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 79\n",
      "Best pair: ('stu', 'de')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 12, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 3, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 5, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 4, 'ation': 1, 'for</w>': 4, 'st': 3, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 1, 't</w>': 1, 'nts</w>': 2, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 0, 'stude': 1})\n",
      "Number of tokens: 111\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'stude nts</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 80\n",
      "Best pair: ('stude', 'nts</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 12, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 3, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 5, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 4, 'ation': 1, 'for</w>': 4, 'st': 3, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 1, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 1, 't</w>': 1, 'nts</w>': 1, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 0, 'stude': 0, 'students</w>': 1})\n",
      "Number of tokens: 112\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'students</w>': 1, 'to</w>': 2, 'de ve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 81\n",
      "Best pair: ('de', 've')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 12, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 3, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 5, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 3, 'ation': 1, 'for</w>': 4, 'st': 3, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 0, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 1, 't</w>': 1, 'nts</w>': 1, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 0, 'stude': 0, 'students</w>': 1, 'deve': 1})\n",
      "Number of tokens: 113\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'students</w>': 1, 'to</w>': 2, 'deve l op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 82\n",
      "Best pair: ('deve', 'l')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 12, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 3, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 4, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 3, 'ation': 1, 'for</w>': 4, 'st': 3, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 0, 'op': 3, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 1, 't</w>': 1, 'nts</w>': 1, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 0, 'stude': 0, 'students</w>': 1, 'deve': 0, 'devel': 1})\n",
      "Number of tokens: 114\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'students</w>': 1, 'to</w>': 2, 'devel op </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 83\n",
      "Best pair: ('devel', 'op')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 12, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 3, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 4, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 3, 'ation': 1, 'for</w>': 4, 'st': 3, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 0, 'op': 2, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 1, 't</w>': 1, 'nts</w>': 1, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 0, 'stude': 0, 'students</w>': 1, 'deve': 0, 'devel': 0, 'develop': 1})\n",
      "Number of tokens: 115\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'students</w>': 1, 'to</w>': 2, 'develop </w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 84\n",
      "Best pair: ('develop', '</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 11, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 3, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 4, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 2, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 3, 'ation': 1, 'for</w>': 4, 'st': 3, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 0, 'op': 2, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 1, 't</w>': 1, 'nts</w>': 1, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 0, 'stude': 0, 'students</w>': 1, 'deve': 0, 'devel': 0, 'develop': 0, 'develop</w>': 1})\n",
      "Number of tokens: 116\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'students</w>': 1, 'to</w>': 2, 'develop</w>': 1, 'an </w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 85\n",
      "Best pair: ('an', '</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 10, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 3, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 5, 'v': 0, 'l': 4, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 1, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 3, 'ation': 1, 'for</w>': 4, 'st': 3, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 0, 'op': 2, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 1, 't</w>': 1, 'nts</w>': 1, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 0, 'stude': 0, 'students</w>': 1, 'deve': 0, 'devel': 0, 'develop': 0, 'develop</w>': 1, 'an</w>': 1})\n",
      "Number of tokens: 117\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'students</w>': 1, 'to</w>': 2, 'develop</w>': 1, 'an</w>': 1, 'u n de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 86\n",
      "Best pair: ('u', 'n')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 10, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 2, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 4, 'v': 0, 'l': 4, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 1, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 3, 'ation': 1, 'for</w>': 4, 'st': 3, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 0, 'op': 2, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 1, 't</w>': 1, 'nts</w>': 1, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 0, 'stude': 0, 'students</w>': 1, 'deve': 0, 'devel': 0, 'develop': 0, 'develop</w>': 1, 'an</w>': 1, 'un': 1})\n",
      "Number of tokens: 118\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'students</w>': 1, 'to</w>': 2, 'develop</w>': 1, 'an</w>': 1, 'un de r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 87\n",
      "Best pair: ('un', 'de')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 10, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 5, 't': 6, 'u': 2, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 4, 'v': 0, 'l': 4, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 1, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 2, 'ation': 1, 'for</w>': 4, 'st': 3, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 0, 'op': 2, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 1, 't</w>': 1, 'nts</w>': 1, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 0, 'stude': 0, 'students</w>': 1, 'deve': 0, 'devel': 0, 'develop': 0, 'develop</w>': 1, 'an</w>': 1, 'un': 0, 'unde': 1})\n",
      "Number of tokens: 119\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'students</w>': 1, 'to</w>': 2, 'develop</w>': 1, 'an</w>': 1, 'unde r st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 88\n",
      "Best pair: ('unde', 'r')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 10, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 4, 't': 6, 'u': 2, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 4, 'v': 0, 'l': 4, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 1, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 2, 'ation': 1, 'for</w>': 4, 'st': 3, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 0, 'op': 2, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 1, 't</w>': 1, 'nts</w>': 1, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 0, 'stude': 0, 'students</w>': 1, 'deve': 0, 'devel': 0, 'develop': 0, 'develop</w>': 1, 'an</w>': 1, 'un': 0, 'unde': 0, 'under': 1})\n",
      "Number of tokens: 120\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'students</w>': 1, 'to</w>': 2, 'develop</w>': 1, 'an</w>': 1, 'under st an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 89\n",
      "Best pair: ('under', 'st')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 10, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 4, 't': 6, 'u': 2, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 4, 'v': 0, 'l': 4, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 1, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 2, 'ation': 1, 'for</w>': 4, 'st': 2, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 0, 'op': 2, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 1, 't</w>': 1, 'nts</w>': 1, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 0, 'stude': 0, 'students</w>': 1, 'deve': 0, 'devel': 0, 'develop': 0, 'develop</w>': 1, 'an</w>': 1, 'un': 0, 'unde': 0, 'under': 0, 'underst': 1})\n",
      "Number of tokens: 121\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'students</w>': 1, 'to</w>': 2, 'develop</w>': 1, 'an</w>': 1, 'underst an ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 90\n",
      "Best pair: ('underst', 'an')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 10, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 4, 't': 6, 'u': 2, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 4, 'v': 0, 'l': 4, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 0, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 2, 'ation': 1, 'for</w>': 4, 'st': 2, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 0, 'op': 2, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 1, 't</w>': 1, 'nts</w>': 1, 'to': 0, 'to</w>': 2, 'ding</w>': 2, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 0, 'stude': 0, 'students</w>': 1, 'deve': 0, 'devel': 0, 'develop': 0, 'develop</w>': 1, 'an</w>': 1, 'un': 0, 'unde': 0, 'under': 0, 'underst': 0, 'understan': 1})\n",
      "Number of tokens: 122\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'students</w>': 1, 'to</w>': 2, 'develop</w>': 1, 'an</w>': 1, 'understan ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 91\n",
      "Best pair: ('understan', 'ding</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 10, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 4, 't': 6, 'u': 2, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 4, 'v': 0, 'l': 4, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 0, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 2, 'ation': 1, 'for</w>': 4, 'st': 2, 'ing</w>': 2, 'in</w>': 4, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 0, 'op': 2, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 1, 't</w>': 1, 'nts</w>': 1, 'to': 0, 'to</w>': 2, 'ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma': 2, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 0, 'stude': 0, 'students</w>': 1, 'deve': 0, 'devel': 0, 'develop': 0, 'develop</w>': 1, 'an</w>': 1, 'un': 0, 'unde': 0, 'under': 0, 'underst': 0, 'understan': 0, 'understanding</w>': 1})\n",
      "Number of tokens: 123\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'students</w>': 1, 'to</w>': 2, 'develop</w>': 1, 'an</w>': 1, 'understanding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma in</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 92\n",
      "Best pair: ('ma', 'in</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 10, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 4, 't': 6, 'u': 2, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 4, 'v': 0, 'l': 4, 'p': 4, 'g': 3, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 0, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 2, 'ation': 1, 'for</w>': 4, 'st': 2, 'ing</w>': 2, 'in</w>': 3, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 0, 'op': 2, 'of': 1, 'al': 3, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 1, 't</w>': 1, 'nts</w>': 1, 'to': 0, 'to</w>': 2, 'ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma': 1, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 0, 'stude': 0, 'students</w>': 1, 'deve': 0, 'devel': 0, 'develop': 0, 'develop</w>': 1, 'an</w>': 1, 'un': 0, 'unde': 0, 'under': 0, 'underst': 0, 'understan': 0, 'understanding</w>': 1, 'main</w>': 1})\n",
      "Number of tokens: 124\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'students</w>': 1, 'to</w>': 2, 'develop</w>': 1, 'an</w>': 1, 'understanding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'main</w>': 1, 'al g or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 93\n",
      "Best pair: ('al', 'g')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 10, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 4, 't': 6, 'u': 2, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 4, 'v': 0, 'l': 4, 'p': 4, 'g': 2, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 0, 's</w>': 3, 'ing': 0, 'or': 3, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 2, 'ation': 1, 'for</w>': 4, 'st': 2, 'ing</w>': 2, 'in</w>': 3, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 0, 'op': 2, 'of': 1, 'al': 2, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 1, 't</w>': 1, 'nts</w>': 1, 'to': 0, 'to</w>': 2, 'ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma': 1, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 0, 'stude': 0, 'students</w>': 1, 'deve': 0, 'devel': 0, 'develop': 0, 'develop</w>': 1, 'an</w>': 1, 'un': 0, 'unde': 0, 'under': 0, 'underst': 0, 'understan': 0, 'understanding</w>': 1, 'main</w>': 1, 'alg': 1})\n",
      "Number of tokens: 125\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'students</w>': 1, 'to</w>': 2, 'develop</w>': 1, 'an</w>': 1, 'understanding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'main</w>': 1, 'alg or i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 94\n",
      "Best pair: ('alg', 'or')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 10, 'a': 3, 'i': 3, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 4, 't': 6, 'u': 2, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 4, 'v': 0, 'l': 4, 'p': 4, 'g': 2, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 0, 's</w>': 3, 'ing': 0, 'or': 2, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 2, 'ation': 1, 'for</w>': 4, 'st': 2, 'ing</w>': 2, 'in</w>': 3, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 0, 'op': 2, 'of': 1, 'al': 2, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 1, 't</w>': 1, 'nts</w>': 1, 'to': 0, 'to</w>': 2, 'ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma': 1, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 0, 'stude': 0, 'students</w>': 1, 'deve': 0, 'devel': 0, 'develop': 0, 'develop</w>': 1, 'an</w>': 1, 'un': 0, 'unde': 0, 'under': 0, 'underst': 0, 'understan': 0, 'understanding</w>': 1, 'main</w>': 1, 'alg': 0, 'algor': 1})\n",
      "Number of tokens: 126\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'students</w>': 1, 'to</w>': 2, 'develop</w>': 1, 'an</w>': 1, 'understanding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'main</w>': 1, 'algor i th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 95\n",
      "Best pair: ('algor', 'i')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 10, 'a': 3, 'i': 2, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 4, 't': 6, 'u': 2, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 4, 'v': 0, 'l': 4, 'p': 4, 'g': 2, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 0, 's</w>': 3, 'ing': 0, 'or': 2, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 2, 'de': 2, 'ation': 1, 'for</w>': 4, 'st': 2, 'ing</w>': 2, 'in</w>': 3, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 0, 'op': 2, 'of': 1, 'al': 2, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 1, 't</w>': 1, 'nts</w>': 1, 'to': 0, 'to</w>': 2, 'ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma': 1, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 0, 'stude': 0, 'students</w>': 1, 'deve': 0, 'devel': 0, 'develop': 0, 'develop</w>': 1, 'an</w>': 1, 'un': 0, 'unde': 0, 'under': 0, 'underst': 0, 'understan': 0, 'understanding</w>': 1, 'main</w>': 1, 'alg': 0, 'algor': 0, 'algori': 1})\n",
      "Number of tokens: 127\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'students</w>': 1, 'to</w>': 2, 'develop</w>': 1, 'an</w>': 1, 'understanding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'main</w>': 1, 'algori th ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 96\n",
      "Best pair: ('algori', 'th')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 10, 'a': 3, 'i': 2, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 4, 't': 6, 'u': 2, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 4, 'v': 0, 'l': 4, 'p': 4, 'g': 2, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 0, 's</w>': 3, 'ing': 0, 'or': 2, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 1, 'de': 2, 'ation': 1, 'for</w>': 4, 'st': 2, 'ing</w>': 2, 'in</w>': 3, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 0, 'op': 2, 'of': 1, 'al': 2, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 1, 'ec': 1, 't</w>': 1, 'nts</w>': 1, 'to': 0, 'to</w>': 2, 'ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma': 1, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 0, 'stude': 0, 'students</w>': 1, 'deve': 0, 'devel': 0, 'develop': 0, 'develop</w>': 1, 'an</w>': 1, 'un': 0, 'unde': 0, 'under': 0, 'underst': 0, 'understan': 0, 'understanding</w>': 1, 'main</w>': 1, 'alg': 0, 'algor': 0, 'algori': 0, 'algorith': 1})\n",
      "Number of tokens: 128\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'students</w>': 1, 'to</w>': 2, 'develop</w>': 1, 'an</w>': 1, 'understanding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'main</w>': 1, 'algorith ms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 97\n",
      "Best pair: ('algorith', 'ms</w>')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 10, 'a': 3, 'i': 2, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 4, 't': 6, 'u': 2, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 4, 'v': 0, 'l': 4, 'p': 4, 'g': 2, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 0, 's</w>': 3, 'ing': 0, 'or': 2, 'on': 2, 'at': 2, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 1, 'de': 2, 'ation': 1, 'for</w>': 4, 'st': 2, 'ing</w>': 2, 'in</w>': 3, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 0, 'op': 2, 'of': 1, 'al': 2, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 0, 'ec': 1, 't</w>': 1, 'nts</w>': 1, 'to': 0, 'to</w>': 2, 'ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma': 1, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 0, 'stude': 0, 'students</w>': 1, 'deve': 0, 'devel': 0, 'develop': 0, 'develop</w>': 1, 'an</w>': 1, 'un': 0, 'unde': 0, 'under': 0, 'underst': 0, 'understan': 0, 'understanding</w>': 1, 'main</w>': 1, 'alg': 0, 'algor': 0, 'algori': 0, 'algorith': 0, 'algorithms</w>': 1})\n",
      "Number of tokens: 129\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'students</w>': 1, 'to</w>': 2, 'develop</w>': 1, 'an</w>': 1, 'understanding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'main</w>': 1, 'algorithms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'n at u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 98\n",
      "Best pair: ('n', 'at')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 10, 'a': 3, 'i': 2, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 4, 't': 6, 'u': 2, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 3, 'v': 0, 'l': 4, 'p': 4, 'g': 2, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 0, 's</w>': 3, 'ing': 0, 'or': 2, 'on': 2, 'at': 1, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 1, 'de': 2, 'ation': 1, 'for</w>': 4, 'st': 2, 'ing</w>': 2, 'in</w>': 3, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 0, 'op': 2, 'of': 1, 'al': 2, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 0, 'ec': 1, 't</w>': 1, 'nts</w>': 1, 'to': 0, 'to</w>': 2, 'ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma': 1, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 0, 'stude': 0, 'students</w>': 1, 'deve': 0, 'devel': 0, 'develop': 0, 'develop</w>': 1, 'an</w>': 1, 'un': 0, 'unde': 0, 'under': 0, 'underst': 0, 'understan': 0, 'understanding</w>': 1, 'main</w>': 1, 'alg': 0, 'algor': 0, 'algori': 0, 'algorith': 0, 'algorithms</w>': 1, 'nat': 1})\n",
      "Number of tokens: 130\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'students</w>': 1, 'to</w>': 2, 'develop</w>': 1, 'an</w>': 1, 'understanding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'main</w>': 1, 'algorithms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'nat u r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n",
      "Iter: 99\n",
      "Best pair: ('nat', 'u')\n",
      "Tokens: defaultdict(<class 'int'>, {'T': 1, 'h': 4, 'e': 8, '</w>': 10, 'a': 3, 'i': 2, 'm': 4, 's': 7, 'f': 1, 'o': 3, 'r': 4, 't': 6, 'u': 1, 'b': 1, 'j': 0, 'c': 4, 'd': 2, 'n': 3, 'v': 0, 'l': 4, 'p': 4, 'g': 2, ',': 0, 'x': 1, 'q': 1, 'w': 2, '.': 0, '-': 3, 'y': 2, 'P': 1, 'k': 1, 'in': 3, 'e</w>': 7, 'an': 0, 's</w>': 3, 'ing': 0, 'or': 2, 'on': 2, 'at': 1, ',</w>': 4, 'd</w>': 0, 'ion': 1, 'for': 1, 'th': 1, 'de': 2, 'ation': 1, 'for</w>': 4, 'st': 2, 'ing</w>': 2, 'in</w>': 3, 'us': 0, 'ang': 1, 'ag': 1, 'ic': 2, 'is</w>': 2, 'nt': 1, 've': 0, 'op': 2, 'of': 1, 'al': 2, 'ed</w>': 1, 'lang': 0, 'langu': 0, 'languag': 0, 'language</w>': 3, 'ss': 1, 'ing,</w>': 3, 'cl': 1, 'and</w>': 3, '.</w>': 1, 'ar': 1, 'Th': 0, 'The</w>': 2, 'ms</w>': 0, 'ec': 1, 't</w>': 1, 'nts</w>': 1, 'to': 0, 'to</w>': 2, 'ding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'ma': 1, 'used</w>': 2, 'pr': 0, 'pro': 2, 'use</w>': 2, 'ver': 2, 'ication': 2, 'incl': 0, 'inclu': 2, 'ass': 0, 'assi': 2, 'ans': 2, 'lation': 2, 'ing.</w>': 2, 'par': 2, 'gr': 0, 'gra': 0, 'gram': 2, 'me': 2, 'ai': 0, 'aims</w>': 1, 'this</w>': 1, 'su': 0, 'sub': 0, 'subj': 0, 'subjec': 0, 'subject</w>': 1, 'stu': 0, 'stude': 0, 'students</w>': 1, 'deve': 0, 'devel': 0, 'develop': 0, 'develop</w>': 1, 'an</w>': 1, 'un': 0, 'unde': 0, 'under': 0, 'underst': 0, 'understan': 0, 'understanding</w>': 1, 'main</w>': 1, 'alg': 0, 'algor': 0, 'algori': 0, 'algorith': 0, 'algorithms</w>': 1, 'nat': 0, 'natu': 1})\n",
      "Number of tokens: 131\n",
      "==========\n",
      "vocab,  {'The</w>': 2, 'aims</w>': 1, 'for</w>': 4, 'this</w>': 1, 'subject</w>': 1, 'is</w>': 2, 'students</w>': 1, 'to</w>': 2, 'develop</w>': 1, 'an</w>': 1, 'understanding</w>': 1, 'of</w>': 2, 'the</w>': 2, 'main</w>': 1, 'algorithms</w>': 1, 'used</w>': 2, 'in</w>': 3, 'natu r al language</w>': 1, 'pro c e ss ing,</w>': 1, 'use</w>': 2, 'a </w>': 1, 'd i ver s e</w>': 1, 'r ang e</w>': 1, 'a p p l ication s</w>': 1, 'inclu ding</w>': 1, 't e x t</w>': 1, 'cl assi f ication ,</w>': 1, 'ma c h in e</w>': 1, 't r ans lation ,</w>': 1, 'and</w>': 3, 'q u e st ion </w>': 1, 'ans w e r ing.</w>': 1, 'T op ic s</w>': 1, 'b e</w>': 1, 'c o ver ed</w>': 1, 'inclu d e</w>': 1, 'par t - of - s p e ec h </w>': 1, 't ag g ing,</w>': 1, 'n - gram </w>': 1, 'language</w>': 2, 'm o de l l ing,</w>': 1, 's y nt a c t ic </w>': 1, 'par s ing</w>': 1, 'de e p </w>': 1, 'l e ar n ing.</w>': 1, 'pro gram m ing</w>': 1, 'P y th on ,</w>': 1, 's e e</w>': 1, 'm or e</w>': 1, 'in for m ation </w>': 1, 'on </w>': 1, 'i t s</w>': 1, 'w or k s h op s ,</w>': 1, 'assi g n me nts</w>': 1, 'in st al lation </w>': 1, 'at </w>': 1, 'h o me .</w>': 1}\n"
     ]
    }
   ],
   "source": [
    "\"\"\"\n",
    "EXAMPLE:\n",
    "    word = 'T h e <\\w>'\n",
    "    pair = ('e', '<\\w>')\n",
    "    word_after_merge = 'T h e<\\w>'\n",
    "    \n",
    "输入:\n",
    "    pair: Tuple[str, str] # 需要合并的字符对\n",
    "    v_in: Dict[str, int]  # 合并前的vocab\n",
    "    \n",
    "输出:\n",
    "    v_out: Dict[str, int] # 合并后的vocab\n",
    "    \n",
    "注意:\n",
    "    当合并word 'Th e<\\w>'中的字符对 ('h', 'e')时，'Th'和'e<\\w>'字符对不能被合并。\n",
    "\"\"\"\n",
    "def merge_vocab(pair, v_in):\n",
    "    v_out = {}\n",
    "    # 把pair拆开，然后用空格合并起来，然后用\\把空格转义\n",
    "    bigram = re.escape(' '.join(pair))\n",
    "    # 自定义一个正则规则, (?<!\\S)h\\ e(?!\\S) 只有前面、后面不是非空白字符(\\S)(意思前后得是没东西的)，才匹配h\\ e，这样就可以把Th\\ e<\\w>排除在外\n",
    "    p = re.compile(r'(?<!\\S)' + bigram + r'(?!\\S)')\n",
    "    \n",
    "    for v in v_in:\n",
    "        # 遍历当前的vocabulary，找到匹配正则的v时，才用合并的pair去替换变成新的pair new，如果没有匹配上，那就保持原来的。\n",
    "        # 比如pair当前是'h'和'e'，然后遍历vocabulary，找到符合前后都没有东西只有'h\\ e'的时候就把他们并在一起变成'he'\n",
    "        new = p.sub(''.join(pair),v)\n",
    "        # 然后新的合并的数量就是当前vocabulary里面pair对应的数量\n",
    "        v_out[new] = v_in[v]\n",
    "    return v_out\n",
    "\n",
    "def get_tokens(vocab):\n",
    "    tokens = collections.defaultdict(int)\n",
    "    for word, freq in vocab.items():\n",
    "        word_tokens = word.split()\n",
    "        for token in word_tokens:\n",
    "            tokens[token] += freq\n",
    "    return tokens\n",
    "\n",
    "\n",
    "vocab = get_vocab(text)\n",
    "print(\"Vocab =\", vocab)\n",
    "print('==========')\n",
    "print('Tokens Before BPE')\n",
    "tokens = get_tokens(vocab)\n",
    "print('Tokens: {}'.format(tokens))\n",
    "print('Number of tokens: {}'.format(len(tokens)))\n",
    "print('==========')\n",
    "\n",
    "#about 100 merges we start to see common words\n",
    "num_merges = 100\n",
    "for i in range(num_merges):\n",
    "    pairs = get_stats(vocab)\n",
    "    if not pairs:\n",
    "        break\n",
    "    \n",
    "    # vocabulary里面pair出现次数最高的作为最先合并的pair\n",
    "    best = max(pairs, key=pairs.get)\n",
    "    \n",
    "    # 先给他合并了再说，当然这里不操作也没什么，到merge_vocab里面都一样\n",
    "    new_token = ''.join(best)\n",
    "    vocab = merge_vocab(best, vocab)\n",
    "    print('Iter: {}'.format(i))\n",
    "    print('Best pair: {}'.format(best))\n",
    "    # add new token to the vocab\n",
    "    tokens[new_token] = pairs[best]\n",
    "    # deduct frequency for tokens have been merged\n",
    "    tokens[best[0]] -= pairs[best]\n",
    "    tokens[best[1]] -= pairs[best]\n",
    "    print('Tokens: {}'.format(tokens))\n",
    "    print('Number of tokens: {}'.format(len(tokens)))\n",
    "    print('==========')\n",
    "    print('vocab, ', vocab)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "整个流程"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tokens = ['understanding</w>', 'algorithms</w>', 'language</w>', 'students</w>', 'subject</w>', 'develop</w>', 'ication', 'lation', 'ing,</w>', 'used</w>', 'inclu', 'ing.</w>', 'aims</w>', 'this</w>', 'main</w>', 'ding</w>', 'ation', 'for</w>', 'and</w>', 'The</w>', 'the</w>', 'use</w>', 'assi', 'gram', 'ing</w>', 'natu', 'nts</w>', 'in</w>', 'is</w>', 'to</w>', 'of</w>', 'pro', 'ver', 'ans', 'par', 'an</w>', 'ang', 'ion', 'ed</w>', 'for', 'e</w>', ',</w>', 's</w>', 'in', 'al', 'st', 'op', 'ic', 'de', 'on', 'or', 'me', 'ss', 't</w>', 'cl', 'ma', 'of', 'ec', 'ag', 'nt', 'ar', 'th', 'at', '.</w>', '</w>', 'e', 's', 't', 'r', 'c', 'p', 'l', 'h', 'm', 'a', 'o', '-', 'n', 'd', 'i', 'w', 'g', 'y', 'x', 'f', 'q', 'u', 'T', 'b', 'P', 'k'] \n",
      "\n",
      "==========\n",
      "Sentence = I like natural language processing!\n",
      "Tokenizing word: I</w>...\n",
      "['</u>', '</w>']\n",
      "Tokenizing word: like</w>...\n",
      "['l', 'i', 'k', 'e</w>']\n",
      "Tokenizing word: natural</w>...\n",
      "['natu', 'r', 'al', '</w>']\n",
      "Tokenizing word: language</w>...\n",
      "['language</w>']\n",
      "Tokenizing word: processing!</w>...\n",
      "['pro', 'c', 'e', 'ss', 'in', 'g', '</u>', '</w>']\n",
      "==========\n",
      "Sentence = I like natural languaaage processing!\n",
      "Tokenizing word: I</w>...\n",
      "['</u>', '</w>']\n",
      "Tokenizing word: like</w>...\n",
      "['l', 'i', 'k', 'e</w>']\n",
      "Tokenizing word: natural</w>...\n",
      "['natu', 'r', 'al', '</w>']\n",
      "Tokenizing word: languaaage</w>...\n",
      "['l', 'ang', 'u', 'a', 'a', 'ag', 'e</w>']\n",
      "Tokenizing word: processing!</w>...\n",
      "['pro', 'c', 'e', 'ss', 'in', 'g', '</u>', '</w>']\n"
     ]
    }
   ],
   "source": [
    "def get_tokens_from_vocab(vocab):\n",
    "    tokens_frequencies = collections.defaultdict(int)\n",
    "    vocab_tokenization = {}\n",
    "    for word, freq in vocab.items():\n",
    "        # 看vocabulary里面的token频率，相当于上面的code中的tokens去除freq为0的\n",
    "        word_tokens = word.split()\n",
    "        for token in word_tokens:\n",
    "            tokens_frequencies[token] += freq\n",
    "        # vocab和其对应的tokens\n",
    "        vocab_tokenization[''.join(word_tokens)] = word_tokens\n",
    "    return tokens_frequencies, vocab_tokenization\n",
    "\n",
    "def measure_token_length(token):\n",
    "    \n",
    "    # 如果token最后四个元素是 < / w >\n",
    "    if token[-4:] == '</w>':\n",
    "        # 那就返回除了最后四个之外的长度再加上1(结尾)\n",
    "        return len(token[:-4]) + 1\n",
    "    else:\n",
    "        # 如果这个token里面没有结尾就直接返回当前长度\n",
    "        return len(token)\n",
    "    \n",
    "# 如果vocabulary里面找不到要拆分的词，就根据已经有的token现拆\n",
    "def tokenize_word(string, sorted_tokens, unknown_token='</u>'):\n",
    "    \n",
    "    # base case，没词进来了，那拆的结果就是空的\n",
    "    if string == '':\n",
    "        return []\n",
    "    # 已有的sorted tokens没有了，那就真的没这个词了\n",
    "    if sorted_tokens == []:\n",
    "        return [unknown_token] * len(string)\n",
    "\n",
    "    # 记录拆分结果\n",
    "    string_tokens = []\n",
    "    \n",
    "    # iterate over all tokens to find match\n",
    "    for i in range(len(sorted_tokens)):\n",
    "        token = sorted_tokens[i]\n",
    "        \n",
    "        # 自定义一个正则，然后要把token里面包含句号的变成[.]\n",
    "        token_reg = re.escape(token.replace('.', '[.]'))\n",
    "        \n",
    "        # 在当前string里面遍历，找到每一个match token的开始和结束位置，比如string=good，然后token是o，输出[(2,2),(3,3)]?\n",
    "        matched_positions = [(m.start(0), m.end(0)) for m in re.finditer(token_reg, string)]\n",
    "        # if no match found in the string, go to next token\n",
    "        if len(matched_positions) == 0:\n",
    "            continue\n",
    "        # 因为要拆分这个词，匹配上的token把这个word拆开了，那就要拿到除了match部分之外的substring，所以这里要拿match的start\n",
    "        substring_end_positions = [matched_position[0] for matched_position in matched_positions]\n",
    "        substring_start_position = 0\n",
    "        \n",
    "        \n",
    "        # 如果有匹配成功的话，就会进入这个循环\n",
    "        for substring_end_position in substring_end_positions:\n",
    "            # slice for sub-word\n",
    "            substring = string[substring_start_position:substring_end_position]\n",
    "            # tokenize this sub-word with tokens remaining 接着用substring匹配剩余的sorted token，因为刚就匹配了一个\n",
    "            string_tokens += tokenize_word(string=substring, sorted_tokens=sorted_tokens[i+1:], unknown_token=unknown_token)\n",
    "            # 先把sorted token里面匹配上的记下来\n",
    "            string_tokens += [token]\n",
    "            substring_start_position = substring_end_position + len(token)\n",
    "        # tokenize the remaining string 去除前头的substring，去除已经匹配上的，后面还剩下substring_start_pos到结束的一段substring没看\n",
    "        remaining_substring = string[substring_start_position:]\n",
    "        # 接着匹配\n",
    "        string_tokens += tokenize_word(string=remaining_substring, sorted_tokens=sorted_tokens[i+1:], unknown_token=unknown_token)\n",
    "        break\n",
    "    else:\n",
    "        # return list of unknown token if no match is found for the string\n",
    "        string_tokens = [unknown_token] * len(string)\n",
    "        \n",
    "    return string_tokens\n",
    "\n",
    "\"\"\"\n",
    "该函数生成一个所有标记的列表，按其长度（第一键）和频率（第二键）排序。\n",
    "\n",
    "EXAMPLE:\n",
    "    token frequency dictionary before sorting: {'natural': 3, 'language':2, 'processing': 4, 'lecture': 4}\n",
    "    sorted tokens: ['processing', 'language', 'lecture', 'natural']\n",
    "    \n",
    "INPUT:\n",
    "    token_frequencies: Dict[str, int] # Counter for token frequency\n",
    "    \n",
    "OUTPUT:\n",
    "    sorted_token: List[str] # Tokens sorted by length and frequency\n",
    "\n",
    "\"\"\"\n",
    "def sort_tokens(tokens_frequencies):\n",
    "    # 对 token_frequencies里面的东西，先进行长度排序，再进行频次，sorted是从低到高所以要reverse\n",
    "    sorted_tokens_tuple = sorted(tokens_frequencies.items(), key=lambda item:(measure_token_length(item[0]),item[1]), reverse=True)\n",
    "    \n",
    "    # 然后只要tokens不要频次\n",
    "    sorted_tokens = [token for (token, freq) in sorted_tokens_tuple]\n",
    "\n",
    "    return sorted_tokens\n",
    "\n",
    "#display the vocab\n",
    "tokens_frequencies, vocab_tokenization = get_tokens_from_vocab(vocab)\n",
    "\n",
    "#sort tokens by length and frequency\n",
    "sorted_tokens = sort_tokens(tokens_frequencies)\n",
    "print(\"Tokens =\", sorted_tokens, \"\\n\")\n",
    "\n",
    "#print(\"vocab tokenization: \", vocab_tokenization)\n",
    "\n",
    "sentence_1 = 'I like natural language processing!'\n",
    "sentence_2 = 'I like natural languaaage processing!'\n",
    "sentence_list = [sentence_1, sentence_2]\n",
    "\n",
    "for sentence in sentence_list:\n",
    "    \n",
    "    print('==========')\n",
    "    print(\"Sentence =\", sentence)\n",
    "    \n",
    "    for word in sentence.split():\n",
    "        word = word + \"</w>\"\n",
    "\n",
    "        print('Tokenizing word: {}...'.format(word))\n",
    "        if word in vocab_tokenization:\n",
    "            print(vocab_tokenization[word])\n",
    "        else:\n",
    "            print(tokenize_word(string=word, sorted_tokens=sorted_tokens, unknown_token='</u>'))\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "science39",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
